diff --git a/batch.go b/batch.go index 77955ed..60e00ca 100644 --- a/batch.go +++ b/batch.go @@ -197,6 +197,9 @@ func (ctx *BatchedContext) enqueue(c call) (retVal DevicePtr, err error) { // WorkAvailable returns the chan where work availability is broadcasted on. func (ctx *BatchedContext) WorkAvailable() <-chan struct{} { return ctx.workAvailable } +// Signal is used to tell the context that work is available +func (ctx *BatchedContext) Signal() { ctx.workAvailable <- struct{}{} } + // DoWork waits for work to come in from the queue. If it's blocking, the entire queue will be processed immediately. // Otherwise it will be added to the batch queue. func (ctx *BatchedContext) DoWork() { diff --git a/batch_test.go b/batch_test.go index 4cb0689..302d854 100644 --- a/batch_test.go +++ b/batch_test.go @@ -3,8 +3,11 @@ package cu import ( "log" "runtime" + "sync/atomic" "testing" "unsafe" + + _ "net/http/pprof" ) func TestBatchContext(t *testing.T) { @@ -123,8 +126,8 @@ func TestLargeBatch(t *testing.T) { dev.TotalMem() - beforeFree, _, _ := MemInfo() ctx := newContext(cuctx) + beforeFree, _, _ := MemInfo() bctx := NewBatchedContext(ctx, dev) runtime.LockOSThread() @@ -140,7 +143,8 @@ func TestLargeBatch(t *testing.T) { } size := int64(len(a) * 4) - go func() { + var freeCount uint32 + go func(fc *uint32) { var memA, memB DevicePtr var frees []DevicePtr @@ -175,13 +179,13 @@ func TestLargeBatch(t *testing.T) { bctx.MemcpyDtoH(unsafe.Pointer(&a[0]), memA, size) bctx.MemcpyDtoH(unsafe.Pointer(&b[0]), memB, size) - log.Printf("Number of frees %v", len(frees)) for _, free := range frees { bctx.MemFree(free) } + atomic.AddUint32(fc, uint32(len(frees))) bctx.workAvailable <- struct{}{} doneChan <- struct{}{} - }() + }(&freeCount) loop: for { @@ -205,14 +209,18 @@ loop: break } } - + mod.Unload() afterFree, _, _ := MemInfo() + cuctx.Destroy() + runtime.GC() + if freeCount != 16114 { + t.Errorf("Expected 16114 frees. Got %d instead", freeCount) + } if afterFree != beforeFree { - t.Errorf("Before: Freemem: %v. After %v | Diff %v", beforeFree, afterFree, (beforeFree-afterFree)/1024) + t.Logf("Before: Freemem: %v. After %v | Diff %v", beforeFree, afterFree, (beforeFree-afterFree)/1024) } - mod.Unload() - cuctx.Destroy() + } func BenchmarkNoBatching(bench *testing.B) { @@ -285,6 +293,10 @@ func BenchmarkNoBatching(bench *testing.B) { bench.Fatalf("Failed to copy memory to b: %v", err) } } + // useful for checking results + // if i == 0 { + // bench.Logf("%v", a[:10]) + // } } MemFree(memA) MemFree(memB) @@ -347,20 +359,39 @@ func BenchmarkBatching(bench *testing.B) { workAvailable := bctx.WorkAvailable() for i := 0; i < bench.N; i++ { for j := 0; j < 100; j++ { - select { - case <-workAvailable: - bctx.DoWork() - default: + done := make(chan struct{}, 1) + go func(done chan struct{}) { bctx.MemcpyHtoD(memA, unsafe.Pointer(&a[0]), size) bctx.MemcpyHtoD(memB, unsafe.Pointer(&b[0]), size) bctx.LaunchKernel(fn, 100, 10, 1, 1000, 1, 1, 0, Stream{}, args) bctx.Synchronize() bctx.MemcpyDtoH(unsafe.Pointer(&a[0]), memA, size) bctx.MemcpyDtoH(unsafe.Pointer(&b[0]), memB, size) + bctx.Signal() + done <- struct{}{} + }(done) + + work: + for { + select { + case <-workAvailable: + bctx.DoWork() + case <-done: + break work + } } + + } + + if err := bctx.Errors(); err != nil { + bench.Fatalf("Failed with errors in benchmark %d. 
Error: %v", i, err) } - } + // useful for checking results + // if i == 0 { + // bench.Logf("%v", a[:10]) + // } + } MemFree(memA) MemFree(memB) mod.Unload() diff --git a/blas/cgoflags.go b/blas/cgoflags.go index d2af2f4..d2e95bc 100644 --- a/blas/cgoflags.go +++ b/blas/cgoflags.go @@ -2,5 +2,5 @@ package cublas // #cgo CFLAGS: -I/usr/local/cuda-9.0/targets/x86_64-linux/include -I/usr/local/cuda/include // #cgo LDFLAGS: -lcublas -// #cgo LDFLAGS: -L/usr/local/cuda-9.0/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 +// #cgo LDFLAGS: -L/usr/local/cuda-9.0/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/lib/x86_64-linux-gnu import "C" diff --git a/blas/example_test.go b/blas/example_test.go index 11a88b8..d636dca 100644 --- a/blas/example_test.go +++ b/blas/example_test.go @@ -8,7 +8,7 @@ import ( "github.com/pkg/errors" "gonum.org/v1/gonum/blas" "gorgonia.org/cu" - "gorgonia.org/cu/blas" + cublas "gorgonia.org/cu/blas" "gorgonia.org/tensor" ) @@ -77,9 +77,8 @@ func (e *Engine) ContextErr() error { return e.ctx.Error() } type foomem []float64 -func (m foomem) Uintptr() uintptr { return uintptr(unsafe.Pointer(&m[0])) } -func (m foomem) Pointer() unsafe.Pointer { return unsafe.Pointer(&m[0]) } -func (m foomem) MemSize() uintptr { return uintptr(len(m) * 8) } +func (m foomem) Uintptr() uintptr { return uintptr(unsafe.Pointer(&m[0])) } +func (m foomem) MemSize() uintptr { return uintptr(len(m) * 8) } func (e *Engine) checkThreeFloat(a, b, ret tensor.Tensor) (ad, bd, retVal *tensor.Dense, err error) { if /*a.IsNativelyAccessible() &&*/ !a.IsManuallyManaged() { diff --git a/cgoflags.go b/cgoflags.go index 9135dca..2089b78 100644 --- a/cgoflags.go +++ b/cgoflags.go @@ -5,21 +5,14 @@ package cu //#cgo LDFLAGS:-lcuda // ////default location: -//#cgo linux,windows LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib -//#cgo linux,windows CFLAGS: -I/usr/local/cuda/include/ +//#cgo linux LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib +//#cgo linux CFLAGS: -I/usr/local/cuda/include // ////default location if not properly symlinked: +//#cgo linux LDFLAGS:-L/usr/local/cuda-11.0/targets/x86_64-linux/lib //#cgo linux LDFLAGS:-L/usr/local/cuda-10.2/lib64 -L/usr/local/cuda-10.2/lib -//#cgo linux LDFLAGS:-L/usr/local/cuda-10.1/lib64 -L/usr/local/cuda-10.1/lib -//#cgo linux LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib -//#cgo linux LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib -//#cgo linux LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib +//#cgo linux CFLAGS: -I/usr/local/cuda-11.0/targets/x86_64-linux/include //#cgo linux CFLAGS: -I/usr/local/cuda-10.2/include/ -//#cgo linux CFLAGS: -I/usr/local/cuda-10.1/include/ -//#cgo linux CFLAGS: -I/usr/local/cuda-6.0/include/ -//#cgo linux CFLAGS: -I/usr/local/cuda-5.5/include/ -//#cgo linux CFLAGS: -I/usr/local/cuda-5.0/include/ -// ////Ubuntu 15.04: //#cgo linux LDFLAGS:-L/usr/lib/x86_64-linux-gnu/ //#cgo linux CFLAGS: -I/usr/include diff --git a/cmd/cublas_integration_example/engine.go b/cmd/cublas_integration_example/engine.go index c950f2f..9acf700 100644 --- a/cmd/cublas_integration_example/engine.go +++ b/cmd/cublas_integration_example/engine.go @@ -5,7 +5,7 @@ import ( "unsafe" "gorgonia.org/cu" - "gorgonia.org/cu/blas" + cublas "gorgonia.org/cu/blas" "gorgonia.org/tensor" ) @@ -74,6 +74,5 @@ func (e *Engine) ContextErr() error { return e.ctx.Error() } type foomem []float64 -func (m foomem) Uintptr() uintptr { return uintptr(unsafe.Pointer(&m[0])) } -func (m foomem) Pointer() unsafe.Pointer { return 
unsafe.Pointer(&m[0]) } -func (m foomem) MemSize() uintptr { return uintptr(len(m) * 8) } +func (m foomem) Uintptr() uintptr { return uintptr(unsafe.Pointer(&m[0])) } +func (m foomem) MemSize() uintptr { return uintptr(len(m) * 8) } diff --git a/cmd/gencublas/binding.go b/cmd/gencublas/binding.go index 39585d0..2042d32 100644 --- a/cmd/gencublas/binding.go +++ b/cmd/gencublas/binding.go @@ -12,9 +12,9 @@ import ( "strings" "text/template" - "github.com/cznic/cc" - "github.com/cznic/xc" bg "github.com/gorgonia/bindgen" + "modernc.org/cc" + "modernc.org/xc" ) var goTypes = map[bg.TypeKey]bg.Template{ diff --git a/cmd/gencublas/main.go b/cmd/gencublas/main.go index 5419d95..921592d 100644 --- a/cmd/gencublas/main.go +++ b/cmd/gencublas/main.go @@ -17,8 +17,8 @@ import ( "path" "strings" - "github.com/cznic/cc" bg "github.com/gorgonia/bindgen" + "modernc.org/cc" ) var ( diff --git a/cmd/gencudnn/README.md b/cmd/gencudnn/README.md index cfb3e44..9db8b46 100644 --- a/cmd/gencudnn/README.md +++ b/cmd/gencudnn/README.md @@ -34,4 +34,17 @@ cudnnStatus_t cudnnGetCallback( unsigned *mask, void **udata, cudnnCallback_t *fptr); - ``` \ No newline at end of file + ``` + + +# TODOs + +## Stubs ## + +* /home/chewxy/workspace/gorgoniaws/src/gorgonia.org/cu/dnn/generated_ctcloss.go. TODO: true +* ~~/home/chewxy/workspace/gorgoniaws/src/gorgonia.org/cu/dnn/generated_spatialtransformer.go. TODO: true~~ +* ~~/home/chewxy/workspace/gorgoniaws/src/gorgonia.org/cu/dnn/generated_seqdata.go. TODO: true~~ +* ~~/home/chewxy/workspace/gorgoniaws/src/gorgonia.org/cu/dnn/generated_backend.go. TODO: true~~ +* ~~/home/chewxy/workspace/gorgoniaws/src/gorgonia.org/cu/dnn/generated_rnndata.go. TODO: true~~ +* ~~/home/chewxy/workspace/gorgoniaws/src/gorgonia.org/cu/dnn/generated_tensortransform.go. TODO: true~~ +* /home/chewxy/workspace/gorgoniaws/src/gorgonia.org/cu/dnn/generated_algorithmdescriptor.go. 
TODO: true diff --git a/cmd/gencudnn/conversion.go b/cmd/gencudnn/conversion.go index ed5f80a..03437ea 100644 --- a/cmd/gencudnn/conversion.go +++ b/cmd/gencudnn/conversion.go @@ -102,6 +102,7 @@ func csig2gosig(cs *bg.CSignature, retVal *GoSignature) (*GoSignature, error) { ioParamList := ioParams[cs.Name] for i, p := range params { + _, isRetVal := retValPos[i] name := p.Name() typeName := goNameOf(p.Type()) diff --git a/cmd/gencudnn/declarations.go b/cmd/gencudnn/declarations.go index 4545013..221af7a 100644 --- a/cmd/gencudnn/declarations.go +++ b/cmd/gencudnn/declarations.go @@ -1,5 +1,7 @@ package main +import "strings" + var empty struct{} var ignoredEnums = map[string]struct{}{ @@ -53,6 +55,17 @@ var ctypes2GoTypes = map[string]string{ "cudnnDropoutDescriptor_t": "Dropout", "cudnnRNNDescriptor_t": "RNN", "cudnnPersistentRNNPlan_t": "PersistentRNNPlan", + + // cuda11 + "cudnnFusedOpsVariantParamPack_t": "FusedOpVariantParams", + "cudnnFusedOpsConstParamPack_t": "FusedOpConsts", + "cudnnSeqDataDescriptor_t": "SeqData", + "cudnnTensorTransformDescriptor_t": "TensorTransform", + "cudnnAlgorithmDescriptor_t": "AlgorithmDescriptor", + "cudnnAlgorithmPerformance_t": "AlgorithmPerformance", + "cudnnBackendDescriptor_t": "Backend", + "cudnnRNNDataDescriptor_t": "RNNData", + "cudnnAttnDescriptor_t": "Attention", } var alphaBetaParams = []string{ @@ -69,6 +82,8 @@ var builtins = map[string]string{ "unsigned long long": "uint64", "size_t": "uintptr", + + "int64_t": "int64", } var go2cBuiltins = map[string]string{ @@ -80,6 +95,8 @@ var go2cBuiltins = map[string]string{ "uint64": "ulonglong", "uintptr": "size_t", + + "int64": "int64_t", } var nonPrimitives = map[string]string{ @@ -101,3 +118,13 @@ var fnParamTypes = map[string]map[string]string{ "cudnnFindConvolutionBackwardDataAlgorithm": {"returnedAlgoCount": "int"}, "cudnnFindConvolutionBackwardDataAlgorithmEx": {"returnedAlgoCount": "int"}, } + +var deprecated = make(map[string]struct{}) + +func init() { + for n, doc := range docs { + if strings.Contains(doc, "has been deprecated in cuDNN 8.0.") { + deprecated[n] = struct{}{} + } + } +} diff --git a/cmd/gencudnn/generatethis.go b/cmd/gencudnn/generatethis.go index dcddc30..17d1c53 100644 --- a/cmd/gencudnn/generatethis.go +++ b/cmd/gencudnn/generatethis.go @@ -6,15 +6,15 @@ import ( "os" "strings" - "github.com/cznic/cc" "github.com/gorgonia/bindgen" "github.com/kr/pretty" + "modernc.org/cc" ) // generate this contains function to generate for THIS package (main) // generateMappings is used to generate the mappings -func generateMappings(appendCurrent bool) { +func generateMappings(appendCurrent bool, fns ...func(buf io.WriteCloser, t *cc.TranslationUnit)) { hdr := "package main\n" initfn := ` @@ -38,17 +38,11 @@ func generateMappings(appendCurrent bool) { fmt.Fprintln(buf, hdr) bindgen.GenIgnored(buf, t, functions) fmt.Fprintln(buf, initfn) - bindgen.GenNameMap(buf, t, "fnNameMap", processNameBasic, functions, true) - bindgen.GenNameMap(buf, t, "enumMappings", processNameBasic, enums, true) - - generateCRUD(buf, t, "create") - generateCRUD(buf, t, "set") - generateCRUD(buf, t, "destroy") - generateCRUD(buf, t, "methods") - fmt.Fprintln(buf, "}\n") } - generateAlphaBeta(buf, t) - fmt.Fprintln(buf, initfn) + + for _, fn := range fns { + fn(buf, t) + } fmt.Fprintln(buf, "}\n") } diff --git a/cmd/gencudnn/main.go b/cmd/gencudnn/main.go index ba6d234..b91f00a 100644 --- a/cmd/gencudnn/main.go +++ b/cmd/gencudnn/main.go @@ -11,8 +11,8 @@ import ( "os" "path" - "github.com/cznic/cc" 
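Aside: a minimal, self-contained sketch of the deprecation-gating idiom that the new `deprecated` map in declarations.go and the check in `generateFunctions` share. The `docs` table below is a hypothetical stand-in for the generator's real scraped documentation map; only the `strings.Contains` test on the cuDNN 8.0 deprecation notice mirrors the actual change.

```go
package main

import (
	"fmt"
	"strings"
)

// docs is a toy stand-in for the generator's scraped documentation table.
var docs = map[string]string{
	"cudnnSetRNNDescriptor": "This function has been deprecated in cuDNN 8.0.",
	"cudnnRNNForward":       "Computes a forward pass over the RNN.",
}

var deprecated = make(map[string]struct{})

func init() {
	// Record anything whose doc string carries the cuDNN 8.0 deprecation
	// notice, so code generation can skip it with a single map lookup.
	for name, doc := range docs {
		if strings.Contains(doc, "has been deprecated in cuDNN 8.0.") {
			deprecated[name] = struct{}{}
		}
	}
}

func main() {
	for _, fn := range []string{"cudnnSetRNNDescriptor", "cudnnRNNForward"} {
		if _, ok := deprecated[fn]; ok {
			fmt.Printf("skipping %s: deprecated in cuDNN 8.0\n", fn)
			continue
		}
		fmt.Printf("generating binding for %s\n", fn)
	}
}
```

The design is cheap on purpose: the set is built once in `init`, and each binding is gated by one map lookup rather than re-scanning the docs.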
"github.com/gorgonia/bindgen" + "modernc.org/cc" ) var pkgloc string @@ -53,8 +53,7 @@ func goimports(filename string) error { func main() { var pkg *PkgState - // pkg = parsePkg(false) - + pkg = parsePkg(false) // Step 0: run parse.py to get more sanity about inputs and outputs // Step 1: Explore // explore(hdrfile, functions, enums, otherTypes) @@ -64,17 +63,26 @@ func main() { // Step 2: generate mappings for this package, then edit them manually // Specifically, the `ignored` map is edited - things that will be manually written are not removed from the list // Some enum map names may also be changed - generateMappings(true) + //defaultPipeline := func(buf io.WriteCloser, t *cc.TranslationUnit) { + // bindgen.GenNameMap(buf, t, "fnNameMap", processNameBasic, functions, true) + // bindgen.GenNameMap(buf, t, "enumMappings", processNameBasic, enums, true) + // generateAlphaBeta(buf, t) + // generateCRUD(buf, t, "create") + // generateCRUD(buf, t, "set") + // generateCRUD(buf, t, "destroy") + //generateCRUD(buf, t, "methods") + //} + //generateMappings(true, defaultPipeline) // Step 3: generate enums, then edit the file in the dnn package. - // generateEnums() - // generateEnumStrings() - // generateStubs(false, pkg) // true/false indicates debug mode + //generateEnums() + //generateEnumStrings() + //generateStubs(false, pkg) // true/false indicates debug mode // Step 4: manual fix for inconsistent names (Spatial Transforms) // step 5: - // generateFunctions(pkg) + generateFunctions(pkg) // report things that aren't done yet pkg = parsePkg(true) @@ -177,7 +185,7 @@ func generateEnums() { if isIgnored(e.Name) { continue } - fmt.Fprintf(buf, "type %v int\nconst (\n", enumMappings[e.Name], enumMappings[e.Name]) + fmt.Fprintf(buf, "type %v int\nconst (\n", enumMappings[e.Name]) var names []string for _, a := range e.Type.EnumeratorList() { @@ -270,6 +278,7 @@ outer: continue } if alreadyProcessedType(gotype, decls) { + log.Printf("Already processed %v", gotype) continue } @@ -439,13 +448,17 @@ func generateFunctions(pkg *PkgState) { if _, ok := ignored[name]; ok { continue } + if _, ok := deprecated[name]; ok { + log.Printf("Ignoring %v becasue it's been deprecated", csig.Name) + continue + } + sig := GoSignature{} sig.Receiver.Name = strings.ToLower(depointerize(goNameOfStr(rec))[0:2]) sig.Receiver.Type = reqPtr(goNameOfStr(rec)) sig.Name = fnNameMap[name] _, err := csig2gosig(csig, &sig) - fmt.Fprintf(buf, "%v { \n", sig) if err != nil { fmt.Fprintf(buf, "// DOUBLECHECK: %v\n", err) diff --git a/cmd/gencudnn/mappings.go b/cmd/gencudnn/mappings.go index 1c390fe..f32838f 100644 --- a/cmd/gencudnn/mappings.go +++ b/cmd/gencudnn/mappings.go @@ -1,430 +1,709 @@ package main var ignored = map[string]struct{}{ - "cudnnGetVersion": {}, - "cudnnGetCudartVersion": {}, - "cudnnGetErrorString": {}, - "cudnnQueryRuntimeError": {}, - "cudnnGetProperty": {}, - "cudnnCreate": {}, - "cudnnDestroy": {}, - "cudnnSetStream": {}, - "cudnnGetStream": {}, - // "cudnnCreateTensorDescriptor": {}, - // "cudnnSetTensor4dDescriptor": {}, - // "cudnnSetTensor4dDescriptorEx": {}, - "cudnnGetTensor4dDescriptor": {}, - // "cudnnSetTensorNdDescriptor": {}, - // "cudnnSetTensorNdDescriptorEx": {}, - "cudnnGetTensorNdDescriptor": {}, - "cudnnGetTensorSizeInBytes": {}, - // "cudnnDestroyTensorDescriptor": {}, - // "cudnnTransformTensor": {}, - // "cudnnAddTensor": {}, - // "cudnnCreateOpTensorDescriptor": {}, - // "cudnnSetOpTensorDescriptor": {}, - "cudnnGetOpTensorDescriptor": {}, - // "cudnnDestroyOpTensorDescriptor": {}, - 
// "cudnnOpTensor": {}, - // "cudnnCreateReduceTensorDescriptor": {}, - // "cudnnSetReduceTensorDescriptor": {}, + // "cudnnActivationBackward":{}, // + // "cudnnActivationForward":{}, // + // "cudnnAddTensor":{}, // + "cudnnAdvInferVersionCheck": {}, + "cudnnAdvTrainVersionCheck": {}, + "cudnnBackendCreateDescriptor": {}, + "cudnnBackendDestroyDescriptor": {}, + "cudnnBackendExecute": {}, + "cudnnBackendFinalize": {}, + "cudnnBackendGetAttribute": {}, + "cudnnBackendInitialize": {}, + // "cudnnBackendSetAttribute": {}, + // "cudnnBatchNormalizationBackward":{}, // + "cudnnBatchNormalizationBackwardEx": {}, + // "cudnnBatchNormalizationForwardInference":{}, // + // "cudnnBatchNormalizationForwardTraining":{}, // + "cudnnBatchNormalizationForwardTrainingEx": {}, + "cudnnBuildRNNDynamic": {}, + // "cudnnCTCLoss":{}, // + "cudnnCTCLoss_v8": {}, + "cudnnCnnInferVersionCheck": {}, + "cudnnCnnTrainVersionCheck": {}, + // "cudnnConvolutionBackwardBias":{}, // + // "cudnnConvolutionBackwardData":{}, // + // "cudnnConvolutionBackwardFilter":{}, // + // "cudnnConvolutionBiasActivationForward":{}, // + // "cudnnConvolutionForward":{}, // + "cudnnCopyAlgorithmDescriptor": {}, + "cudnnCreate": {}, + // "cudnnCreateActivationDescriptor":{}, // + "cudnnCreateAlgorithmDescriptor": {}, + "cudnnCreateAlgorithmPerformance": {}, + "cudnnCreateAttnDescriptor": {}, + // "cudnnCreateCTCLossDescriptor":{}, // + "cudnnCreateConvolutionDescriptor": {}, + // "cudnnCreateDropoutDescriptor":{}, // + // "cudnnCreateFilterDescriptor":{}, // + "cudnnCreateFusedOpsConstParamPack": {}, + "cudnnCreateFusedOpsPlan": {}, + "cudnnCreateFusedOpsVariantParamPack": {}, + // "cudnnCreateLRNDescriptor":{}, // + // "cudnnCreateOpTensorDescriptor":{}, // + // "cudnnCreatePersistentRNNPlan":{}, // + // "cudnnCreatePoolingDescriptor":{}, // + "cudnnCreateRNNDataDescriptor": {}, + // "cudnnCreateRNNDescriptor":{}, // + // "cudnnCreateReduceTensorDescriptor":{}, // + "cudnnCreateSeqDataDescriptor": {}, + // "cudnnCreateSpatialTransformerDescriptor":{}, // + // "cudnnCreateTensorDescriptor":{}, // + "cudnnCreateTensorTransformDescriptor": {}, + // "cudnnDeriveBNTensorDescriptor":{}, // + "cudnnDeriveNormTensorDescriptor": {}, + "cudnnDestroy": {}, + // "cudnnDestroyActivationDescriptor":{}, // + "cudnnDestroyAlgorithmDescriptor": {}, + "cudnnDestroyAlgorithmPerformance": {}, + "cudnnDestroyAttnDescriptor": {}, + // "cudnnDestroyCTCLossDescriptor":{}, // + "cudnnDestroyConvolutionDescriptor": {}, + // "cudnnDestroyDropoutDescriptor":{}, // + // "cudnnDestroyFilterDescriptor":{}, // + "cudnnDestroyFusedOpsConstParamPack": {}, + "cudnnDestroyFusedOpsPlan": {}, + "cudnnDestroyFusedOpsVariantParamPack": {}, + // "cudnnDestroyLRNDescriptor":{}, // + // "cudnnDestroyOpTensorDescriptor":{}, // + // "cudnnDestroyPersistentRNNPlan":{}, // + // "cudnnDestroyPoolingDescriptor":{}, // + "cudnnDestroyRNNDataDescriptor": {}, + // "cudnnDestroyRNNDescriptor":{}, // + // "cudnnDestroyReduceTensorDescriptor":{}, // + "cudnnDestroySeqDataDescriptor": {}, + // "cudnnDestroySpatialTransformerDescriptor":{}, // + // "cudnnDestroyTensorDescriptor":{}, // + "cudnnDestroyTensorTransformDescriptor": {}, + // "cudnnDivisiveNormalizationBackward":{}, // + // "cudnnDivisiveNormalizationForward":{}, // + // "cudnnDropoutBackward":{}, // + // "cudnnDropoutForward":{}, // + // "cudnnDropoutGetReserveSpaceSize":{}, // + // "cudnnDropoutGetStatesSize":{}, // + // "cudnnFindConvolutionBackwardDataAlgorithm":{}, // + // "cudnnFindConvolutionBackwardDataAlgorithmEx":{}, // + 
// "cudnnFindConvolutionBackwardFilterAlgorithm":{}, // + // "cudnnFindConvolutionBackwardFilterAlgorithmEx":{}, // + // "cudnnFindConvolutionForwardAlgorithm":{}, // + // "cudnnFindConvolutionForwardAlgorithmEx":{}, // + "cudnnFindRNNBackwardDataAlgorithmEx": {}, + "cudnnFindRNNBackwardWeightsAlgorithmEx": {}, + "cudnnFindRNNForwardInferenceAlgorithmEx": {}, + "cudnnFindRNNForwardTrainingAlgorithmEx": {}, + "cudnnFusedOpsExecute": {}, + "cudnnGetActivationDescriptor": {}, + "cudnnGetAlgorithmDescriptor": {}, + "cudnnGetAlgorithmPerformance": {}, + "cudnnGetAlgorithmSpaceSize": {}, + "cudnnGetAttnDescriptor": {}, + "cudnnGetBatchNormalizationBackwardExWorkspaceSize": {}, + "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize": {}, + "cudnnGetBatchNormalizationTrainingExReserveSpaceSize": {}, + "cudnnGetCTCLossDescriptor": {}, + "cudnnGetCTCLossDescriptorEx": {}, + "cudnnGetCTCLossDescriptor_v8": {}, + "cudnnGetCTCLossWorkspaceSize": {}, + "cudnnGetCTCLossWorkspaceSize_v8": {}, + "cudnnGetCallback": {}, + "cudnnGetConvolution2dDescriptor": {}, + "cudnnGetConvolution2dForwardOutputDim": {}, + "cudnnGetConvolutionBackwardDataAlgorithmMaxCount": {}, + "cudnnGetConvolutionBackwardDataAlgorithm_v7": {}, + "cudnnGetConvolutionBackwardDataWorkspaceSize": {}, + "cudnnGetConvolutionBackwardFilterAlgorithmMaxCount": {}, + "cudnnGetConvolutionBackwardFilterAlgorithm_v7": {}, + "cudnnGetConvolutionBackwardFilterWorkspaceSize": {}, + "cudnnGetConvolutionForwardAlgorithmMaxCount": {}, + "cudnnGetConvolutionForwardAlgorithm_v7": {}, + "cudnnGetConvolutionForwardWorkspaceSize": {}, + "cudnnGetConvolutionGroupCount": {}, + "cudnnGetConvolutionMathType": {}, + "cudnnGetConvolutionNdDescriptor": {}, + "cudnnGetConvolutionNdForwardOutputDim": {}, + "cudnnGetConvolutionReorderType": {}, + "cudnnGetCudartVersion": {}, + "cudnnGetDropoutDescriptor": {}, + "cudnnGetErrorString": {}, + "cudnnGetFilter4dDescriptor": {}, + "cudnnGetFilterNdDescriptor": {}, + "cudnnGetFilterSizeInBytes": {}, + "cudnnGetFoldedConvBackwardDataDescriptors": {}, + "cudnnGetFusedOpsConstParamPackAttribute": {}, + "cudnnGetFusedOpsVariantParamPackAttribute": {}, + "cudnnGetLRNDescriptor": {}, + "cudnnGetMultiHeadAttnBuffers": {}, + "cudnnGetMultiHeadAttnWeights": {}, + "cudnnGetNormalizationBackwardWorkspaceSize": {}, + "cudnnGetNormalizationForwardTrainingWorkspaceSize": {}, + "cudnnGetNormalizationTrainingReserveSpaceSize": {}, + "cudnnGetOpTensorDescriptor": {}, + "cudnnGetPooling2dDescriptor": {}, + "cudnnGetPooling2dForwardOutputDim": {}, + "cudnnGetPoolingNdDescriptor": {}, + "cudnnGetPoolingNdForwardOutputDim": {}, + "cudnnGetProperty": {}, + "cudnnGetRNNBackwardDataAlgorithmMaxCount": {}, + "cudnnGetRNNBackwardWeightsAlgorithmMaxCount": {}, + "cudnnGetRNNBiasMode": {}, + "cudnnGetRNNDataDescriptor": {}, + "cudnnGetRNNDescriptor_v6": {}, + "cudnnGetRNNDescriptor_v8": {}, + "cudnnGetRNNForwardInferenceAlgorithmMaxCount": {}, + "cudnnGetRNNForwardTrainingAlgorithmMaxCount": {}, + "cudnnGetRNNLinLayerBiasParams": {}, // + "cudnnGetRNNLinLayerMatrixParams": {}, // + "cudnnGetRNNMatrixMathType": {}, + "cudnnGetRNNPaddingMode": {}, + // "cudnnGetRNNParamsSize":{}, // + "cudnnGetRNNProjectionLayers": {}, + "cudnnGetRNNTempSpaceSizes": {}, + // "cudnnGetRNNTrainingReserveSize":{}, // + "cudnnGetRNNWeightParams": {}, + "cudnnGetRNNWeightSpaceSize": {}, + // "cudnnGetRNNWorkspaceSize":{}, // "cudnnGetReduceTensorDescriptor": {}, - // "cudnnDestroyReduceTensorDescriptor": {}, - // "cudnnGetReductionIndicesSize": {}, - // 
"cudnnGetReductionWorkspaceSize": {}, - // "cudnnReduceTensor": {}, - // "cudnnSetTensor": {}, - // "cudnnScaleTensor": {}, - // "cudnnCreateFilterDescriptor": {}, - // "cudnnSetFilter4dDescriptor": {}, - "cudnnGetFilter4dDescriptor": {}, - // "cudnnSetFilterNdDescriptor": {}, - "cudnnGetFilterNdDescriptor": {}, - // "cudnnDestroyFilterDescriptor": {}, - "cudnnCreateConvolutionDescriptor": {}, - "cudnnSetConvolutionMathType": {}, - "cudnnGetConvolutionMathType": {}, - "cudnnSetConvolutionGroupCount": {}, - "cudnnGetConvolutionGroupCount": {}, - "cudnnSetConvolution2dDescriptor": {}, - "cudnnGetConvolution2dDescriptor": {}, - "cudnnGetConvolution2dForwardOutputDim": {}, - "cudnnSetConvolutionNdDescriptor": {}, - "cudnnGetConvolutionNdDescriptor": {}, - "cudnnGetConvolutionNdForwardOutputDim": {}, - "cudnnDestroyConvolutionDescriptor": {}, - "cudnnGetConvolutionForwardAlgorithmMaxCount": {}, - // "cudnnFindConvolutionForwardAlgorithm": {}, - // "cudnnFindConvolutionForwardAlgorithmEx": {}, - "cudnnGetConvolutionForwardAlgorithm": {}, - "cudnnGetConvolutionForwardAlgorithm_v7": {}, - "cudnnGetConvolutionForwardWorkspaceSize": {}, - // "cudnnConvolutionForward": {}, - // "cudnnConvolutionBiasActivationForward": {}, - // "cudnnConvolutionBackwardBias": {}, - "cudnnGetConvolutionBackwardFilterAlgorithmMaxCount": {}, - // "cudnnFindConvolutionBackwardFilterAlgorithm": {}, - // "cudnnFindConvolutionBackwardFilterAlgorithmEx": {}, - "cudnnGetConvolutionBackwardFilterAlgorithm": {}, - "cudnnGetConvolutionBackwardFilterAlgorithm_v7": {}, - "cudnnGetConvolutionBackwardFilterWorkspaceSize": {}, - // "cudnnConvolutionBackwardFilter": {}, - "cudnnGetConvolutionBackwardDataAlgorithmMaxCount": {}, - // "cudnnFindConvolutionBackwardDataAlgorithm": {}, - // "cudnnFindConvolutionBackwardDataAlgorithmEx": {}, - "cudnnGetConvolutionBackwardDataAlgorithm": {}, - "cudnnGetConvolutionBackwardDataAlgorithm_v7": {}, - "cudnnGetConvolutionBackwardDataWorkspaceSize": {}, - // "cudnnConvolutionBackwardData": {}, - // "cudnnIm2Col": {}, - // "cudnnSoftmaxForward": {}, - // "cudnnSoftmaxBackward": {}, - // "cudnnCreatePoolingDescriptor": {}, - // "cudnnSetPooling2dDescriptor": {}, - "cudnnGetPooling2dDescriptor": {}, - // "cudnnSetPoolingNdDescriptor": {}, - "cudnnGetPoolingNdDescriptor": {}, - "cudnnGetPoolingNdForwardOutputDim": {}, - "cudnnGetPooling2dForwardOutputDim": {}, - // "cudnnDestroyPoolingDescriptor": {}, - // "cudnnPoolingForward": {}, - // "cudnnPoolingBackward": {}, - // "cudnnCreateActivationDescriptor": {}, - // "cudnnSetActivationDescriptor": {}, - "cudnnGetActivationDescriptor": {}, - // "cudnnDestroyActivationDescriptor": {}, - // "cudnnActivationForward": {}, - // "cudnnActivationBackward": {}, - // "cudnnCreateLRNDescriptor": {}, - // "cudnnSetLRNDescriptor": {}, - "cudnnGetLRNDescriptor": {}, - // "cudnnDestroyLRNDescriptor": {}, - // "cudnnLRNCrossChannelForward": {}, - // "cudnnLRNCrossChannelBackward": {}, - // "cudnnDivisiveNormalizationForward": {}, - // "cudnnDivisiveNormalizationBackward": {}, - // "cudnnDeriveBNTensorDescriptor": {}, - // "cudnnBatchNormalizationForwardTraining": {}, - // "cudnnBatchNormalizationForwardInference": {}, - // "cudnnBatchNormalizationBackward": {}, - // "cudnnCreateSpatialTransformerDescriptor": {}, - // "cudnnSetSpatialTransformerNdDescriptor": {}, - // "cudnnDestroySpatialTransformerDescriptor": {}, - // "cudnnSpatialTfGridGeneratorForward": {}, - // "cudnnSpatialTfGridGeneratorBackward": {}, - // "cudnnSpatialTfSamplerForward": {}, - // 
"cudnnSpatialTfSamplerBackward": {}, - // "cudnnCreateDropoutDescriptor": {}, - // "cudnnDestroyDropoutDescriptor": {}, - // "cudnnDropoutGetStatesSize": {}, - // "cudnnDropoutGetReserveSpaceSize": {}, - // "cudnnSetDropoutDescriptor": {}, - // "cudnnRestoreDropoutDescriptor": {}, - "cudnnGetDropoutDescriptor": {}, - // "cudnnDropoutForward": {}, - // "cudnnDropoutBackward": {}, - // "cudnnCreateRNNDescriptor": {}, - // "cudnnDestroyRNNDescriptor": {}, - // "cudnnCreatePersistentRNNPlan": {}, - // "cudnnSetPersistentRNNPlan": {}, - // "cudnnDestroyPersistentRNNPlan": {}, - // "cudnnSetRNNDescriptor": {}, - "cudnnGetRNNDescriptor": {}, - // "cudnnSetRNNMatrixMathType": {}, - // "cudnnGetRNNWorkspaceSize": {}, - // "cudnnGetRNNTrainingReserveSize": {}, - // "cudnnGetRNNParamsSize": {}, - // "cudnnGetRNNLinLayerMatrixParams": {}, - // "cudnnGetRNNLinLayerBiasParams": {}, - // "cudnnRNNForwardInference": {}, - // "cudnnRNNForwardTraining": {}, - // "cudnnRNNBackwardData": {}, - // "cudnnRNNBackwardWeights": {}, - // "cudnnCreateCTCLossDescriptor": {}, - // "cudnnSetCTCLossDescriptor": {}, - "cudnnGetCTCLossDescriptor": {}, - // "cudnnDestroyCTCLossDescriptor": {}, - // "cudnnCTCLoss": {}, - "cudnnGetCTCLossWorkspaceSize": {}, - // "cudnnSetRNNDescriptor_v6": {}, - // "cudnnSetRNNDescriptor_v5": {}, + // "cudnnGetReductionIndicesSize":{}, // + // "cudnnGetReductionWorkspaceSize":{}, // + "cudnnGetSeqDataDescriptor": {}, + "cudnnGetStream": {}, + "cudnnGetTensor4dDescriptor": {}, + "cudnnGetTensorNdDescriptor": {}, + "cudnnGetTensorSizeInBytes": {}, + "cudnnGetTensorTransformDescriptor": {}, + "cudnnGetVersion": {}, + // "cudnnIm2Col":{}, // + "cudnnInitTransformDest": {}, + // "cudnnLRNCrossChannelBackward":{}, // + // "cudnnLRNCrossChannelForward":{}, // + "cudnnMakeFusedOpsPlan": {}, + "cudnnMultiHeadAttnBackwardData": {}, + "cudnnMultiHeadAttnBackwardWeights": {}, + "cudnnMultiHeadAttnForward": {}, + "cudnnNormalizationBackward": {}, + "cudnnNormalizationForwardInference": {}, + "cudnnNormalizationForwardTraining": {}, + // "cudnnOpTensor":{}, // + "cudnnOpsInferVersionCheck": {}, + "cudnnOpsTrainVersionCheck": {}, + // "cudnnPoolingBackward":{}, // + // "cudnnPoolingForward":{}, // + "cudnnQueryRuntimeError": {}, + // "cudnnRNNBackwardData":{}, // + "cudnnRNNBackwardDataEx": {}, + "cudnnRNNBackwardData_v8": {}, + // "cudnnRNNBackwardWeights":{}, // + "cudnnRNNBackwardWeightsEx": {}, + "cudnnRNNBackwardWeights_v8": {}, + "cudnnRNNForward": {}, + // "cudnnRNNForwardInference":{}, // + "cudnnRNNForwardInferenceEx": {}, + "cudnnRNNForwardTraining": {}, // looks to be deprecated + "cudnnRNNForwardTrainingEx": {}, + "cudnnRNNGetClip": {}, + "cudnnRNNGetClip_v8": {}, + "cudnnRNNSetClip": {}, + "cudnnRNNSetClip_v8": {}, + // "cudnnReduceTensor":{}, // + "cudnnReorderFilterAndBias": {}, + "cudnnRestoreAlgorithm": {}, + // "cudnnRestoreDropoutDescriptor":{}, // + "cudnnSaveAlgorithm": {}, + // "cudnnScaleTensor":{}, // + // "cudnnSetActivationDescriptor":{}, // + // "cudnnSetAlgorithmDescriptor": {}, // + // "cudnnSetAlgorithmPerformance": {}, // + // "cudnnSetAttnDescriptor": {}, // + // "cudnnSetCTCLossDescriptor":{}, // + //"cudnnSetCTCLossDescriptorEx": {},// + //"cudnnSetCTCLossDescriptor_v8": {},// + "cudnnSetCallback": {}, + "cudnnSetConvolution2dDescriptor": {}, + "cudnnSetConvolutionGroupCount": {}, + "cudnnSetConvolutionMathType": {}, + "cudnnSetConvolutionNdDescriptor": {}, + "cudnnSetConvolutionReorderType": {}, + // "cudnnSetDropoutDescriptor":{}, // + // "cudnnSetFilter4dDescriptor":{}, 
// + // "cudnnSetFilterNdDescriptor":{}, // + // "cudnnSetFusedOpsConstParamPackAttribute": {}, // + // "cudnnSetFusedOpsVariantParamPackAttribute": {}, // + // "cudnnSetLRNDescriptor":{}, // + // "cudnnSetOpTensorDescriptor":{}, // + // "cudnnSetPersistentRNNPlan":{}, // + // "cudnnSetPooling2dDescriptor":{}, // + // "cudnnSetPoolingNdDescriptor":{}, // + "cudnnSetRNNAlgorithmDescriptor": {}, + "cudnnSetRNNBiasMode": {}, + //"cudnnSetRNNDataDescriptor": {}, // + // "cudnnSetRNNDescriptor_v6":{}, // + "cudnnSetRNNDescriptor_v8": {}, + // "cudnnSetRNNMatrixMathType":{}, // + "cudnnSetRNNPaddingMode": {}, + "cudnnSetRNNProjectionLayers": {}, + // "cudnnSetReduceTensorDescriptor":{}, // + // "cudnnSetSeqDataDescriptor": {}, // + // "cudnnSetSpatialTransformerNdDescriptor":{}, // + // "cudnnSetStream": {},// + // "cudnnSetTensor":{}, // + // "cudnnSetTensor4dDescriptor":{}, // + // "cudnnSetTensor4dDescriptorEx":{}, // + // "cudnnSetTensorNdDescriptor":{}, // + // "cudnnSetTensorNdDescriptorEx":{}, // + // "cudnnSetTensorTransformDescriptor": {}, // + // "cudnnSoftmaxBackward":{}, // + // "cudnnSoftmaxForward":{}, // + // "cudnnSpatialTfGridGeneratorBackward":{}, // + // "cudnnSpatialTfGridGeneratorForward":{}, // + // "cudnnSpatialTfSamplerBackward":{}, // + // "cudnnSpatialTfSamplerForward":{}, // + "cudnnTransformFilter": {}, + // "cudnnTransformTensor":{}, // + "cudnnTransformTensorEx": {}, } func init() { + fnNameMap = map[string]string{ - "cudnnGetVersion": "GetVersion", - "cudnnGetCudartVersion": "GetCudartVersion", - "cudnnGetErrorString": "GetErrorString", - "cudnnQueryRuntimeError": "QueryRuntimeError", - "cudnnGetProperty": "GetProperty", - "cudnnCreate": "Create", - "cudnnDestroy": "Destroy", - "cudnnSetStream": "SetStream", - "cudnnGetStream": "GetStream", - "cudnnCreateTensorDescriptor": "CreateTensorDescriptor", - "cudnnSetTensor4dDescriptor": "SetTensor4dDescriptor", - "cudnnSetTensor4dDescriptorEx": "SetTensor4dDescriptorEx", - "cudnnGetTensor4dDescriptor": "GetTensor4dDescriptor", - "cudnnSetTensorNdDescriptor": "SetTensorNdDescriptor", - "cudnnSetTensorNdDescriptorEx": "SetTensorNdDescriptorEx", - "cudnnGetTensorNdDescriptor": "GetTensorNdDescriptor", - "cudnnGetTensorSizeInBytes": "GetTensorSizeInBytes", - "cudnnDestroyTensorDescriptor": "DestroyTensorDescriptor", - "cudnnTransformTensor": "TransformTensor", - "cudnnAddTensor": "AddTensor", - "cudnnCreateOpTensorDescriptor": "CreateOpTensorDescriptor", - "cudnnSetOpTensorDescriptor": "SetOpTensorDescriptor", - "cudnnGetOpTensorDescriptor": "GetOpTensorDescriptor", - "cudnnDestroyOpTensorDescriptor": "DestroyOpTensorDescriptor", - "cudnnOpTensor": "OpTensor", - "cudnnCreateReduceTensorDescriptor": "CreateReduceTensorDescriptor", - "cudnnSetReduceTensorDescriptor": "SetReduceTensorDescriptor", - "cudnnGetReduceTensorDescriptor": "GetReduceTensorDescriptor", - "cudnnDestroyReduceTensorDescriptor": "DestroyReduceTensorDescriptor", - "cudnnGetReductionIndicesSize": "GetReductionIndicesSize", - "cudnnGetReductionWorkspaceSize": "GetReductionWorkspaceSize", - "cudnnReduceTensor": "ReduceTensor", - "cudnnSetTensor": "SetTensor", - "cudnnScaleTensor": "ScaleTensor", - "cudnnCreateFilterDescriptor": "CreateFilterDescriptor", - "cudnnSetFilter4dDescriptor": "SetFilter4dDescriptor", - "cudnnGetFilter4dDescriptor": "GetFilter4dDescriptor", - "cudnnSetFilterNdDescriptor": "SetFilterNdDescriptor", - "cudnnGetFilterNdDescriptor": "GetFilterNdDescriptor", - "cudnnDestroyFilterDescriptor": "DestroyFilterDescriptor", - 
"cudnnCreateConvolutionDescriptor": "CreateConvolutionDescriptor", - "cudnnSetConvolutionMathType": "SetConvolutionMathType", - "cudnnGetConvolutionMathType": "GetConvolutionMathType", - "cudnnSetConvolutionGroupCount": "SetConvolutionGroupCount", - "cudnnGetConvolutionGroupCount": "GetConvolutionGroupCount", - "cudnnSetConvolution2dDescriptor": "SetConvolution2dDescriptor", - "cudnnGetConvolution2dDescriptor": "GetConvolution2dDescriptor", - "cudnnGetConvolution2dForwardOutputDim": "GetConvolution2dForwardOutputDim", - "cudnnSetConvolutionNdDescriptor": "SetConvolutionNdDescriptor", - "cudnnGetConvolutionNdDescriptor": "GetConvolutionNdDescriptor", - "cudnnGetConvolutionNdForwardOutputDim": "GetConvolutionNdForwardOutputDim", - "cudnnDestroyConvolutionDescriptor": "DestroyConvolutionDescriptor", - "cudnnGetConvolutionForwardAlgorithmMaxCount": "GetConvolutionForwardAlgorithmMaxCount", - "cudnnFindConvolutionForwardAlgorithm": "FindConvolutionForwardAlgorithm", - "cudnnFindConvolutionForwardAlgorithmEx": "FindConvolutionForwardAlgorithmEx", - "cudnnGetConvolutionForwardAlgorithm": "GetConvolutionForwardAlgorithm", - "cudnnGetConvolutionForwardAlgorithm_v7": "GetConvolutionForwardAlgorithm_v7", - "cudnnGetConvolutionForwardWorkspaceSize": "GetConvolutionForwardWorkspaceSize", - "cudnnConvolutionForward": "ConvolutionForward", - "cudnnConvolutionBiasActivationForward": "ConvolutionBiasActivationForward", - "cudnnConvolutionBackwardBias": "ConvolutionBackwardBias", - "cudnnGetConvolutionBackwardFilterAlgorithmMaxCount": "GetConvolutionBackwardFilterAlgorithmMaxCount", - "cudnnFindConvolutionBackwardFilterAlgorithm": "FindConvolutionBackwardFilterAlgorithm", - "cudnnFindConvolutionBackwardFilterAlgorithmEx": "FindConvolutionBackwardFilterAlgorithmEx", - "cudnnGetConvolutionBackwardFilterAlgorithm": "GetConvolutionBackwardFilterAlgorithm", - "cudnnGetConvolutionBackwardFilterAlgorithm_v7": "GetConvolutionBackwardFilterAlgorithm_v7", - "cudnnGetConvolutionBackwardFilterWorkspaceSize": "GetConvolutionBackwardFilterWorkspaceSize", - "cudnnConvolutionBackwardFilter": "ConvolutionBackwardFilter", - "cudnnGetConvolutionBackwardDataAlgorithmMaxCount": "GetConvolutionBackwardDataAlgorithmMaxCount", - "cudnnFindConvolutionBackwardDataAlgorithm": "FindConvolutionBackwardDataAlgorithm", - "cudnnFindConvolutionBackwardDataAlgorithmEx": "FindConvolutionBackwardDataAlgorithmEx", - "cudnnGetConvolutionBackwardDataAlgorithm": "GetConvolutionBackwardDataAlgorithm", - "cudnnGetConvolutionBackwardDataAlgorithm_v7": "GetConvolutionBackwardDataAlgorithm_v7", - "cudnnGetConvolutionBackwardDataWorkspaceSize": "GetConvolutionBackwardDataWorkspaceSize", - "cudnnConvolutionBackwardData": "ConvolutionBackwardData", - "cudnnIm2Col": "Im2Col", - "cudnnSoftmaxForward": "SoftmaxForward", - "cudnnSoftmaxBackward": "SoftmaxBackward", - "cudnnCreatePoolingDescriptor": "CreatePoolingDescriptor", - "cudnnSetPooling2dDescriptor": "SetPooling2dDescriptor", - "cudnnGetPooling2dDescriptor": "GetPooling2dDescriptor", - "cudnnSetPoolingNdDescriptor": "SetPoolingNdDescriptor", - "cudnnGetPoolingNdDescriptor": "GetPoolingNdDescriptor", - "cudnnGetPoolingNdForwardOutputDim": "GetPoolingNdForwardOutputDim", - "cudnnGetPooling2dForwardOutputDim": "GetPooling2dForwardOutputDim", - "cudnnDestroyPoolingDescriptor": "DestroyPoolingDescriptor", - "cudnnPoolingForward": "PoolingForward", - "cudnnPoolingBackward": "PoolingBackward", - "cudnnCreateActivationDescriptor": "CreateActivationDescriptor", - "cudnnSetActivationDescriptor": 
"SetActivationDescriptor", - "cudnnGetActivationDescriptor": "GetActivationDescriptor", - "cudnnDestroyActivationDescriptor": "DestroyActivationDescriptor", - "cudnnActivationForward": "ActivationForward", - "cudnnActivationBackward": "ActivationBackward", - "cudnnCreateLRNDescriptor": "CreateLRNDescriptor", - "cudnnSetLRNDescriptor": "SetLRNDescriptor", - "cudnnGetLRNDescriptor": "GetLRNDescriptor", - "cudnnDestroyLRNDescriptor": "DestroyLRNDescriptor", - "cudnnLRNCrossChannelForward": "LRNCrossChannelForward", - "cudnnLRNCrossChannelBackward": "LRNCrossChannelBackward", - "cudnnDivisiveNormalizationForward": "DivisiveNormalizationForward", - "cudnnDivisiveNormalizationBackward": "DivisiveNormalizationBackward", - "cudnnDeriveBNTensorDescriptor": "DeriveBNTensorDescriptor", - "cudnnBatchNormalizationForwardTraining": "BatchNormalizationForwardTraining", - "cudnnBatchNormalizationForwardInference": "BatchNormalizationForwardInference", - "cudnnBatchNormalizationBackward": "BatchNormalizationBackward", - "cudnnCreateSpatialTransformerDescriptor": "CreateSpatialTransformerDescriptor", - "cudnnSetSpatialTransformerNdDescriptor": "SetSpatialTransformerNdDescriptor", - "cudnnDestroySpatialTransformerDescriptor": "DestroySpatialTransformerDescriptor", - "cudnnSpatialTfGridGeneratorForward": "SpatialTfGridGeneratorForward", - "cudnnSpatialTfGridGeneratorBackward": "SpatialTfGridGeneratorBackward", - "cudnnSpatialTfSamplerForward": "SpatialTfSamplerForward", - "cudnnSpatialTfSamplerBackward": "SpatialTfSamplerBackward", - "cudnnCreateDropoutDescriptor": "CreateDropoutDescriptor", - "cudnnDestroyDropoutDescriptor": "DestroyDropoutDescriptor", - "cudnnDropoutGetStatesSize": "DropoutGetStatesSize", - "cudnnDropoutGetReserveSpaceSize": "DropoutGetReserveSpaceSize", - "cudnnSetDropoutDescriptor": "SetDropoutDescriptor", - "cudnnRestoreDropoutDescriptor": "RestoreDropoutDescriptor", - "cudnnGetDropoutDescriptor": "GetDropoutDescriptor", - "cudnnDropoutForward": "DropoutForward", - "cudnnDropoutBackward": "DropoutBackward", - "cudnnCreateRNNDescriptor": "CreateRNNDescriptor", - "cudnnDestroyRNNDescriptor": "DestroyRNNDescriptor", - "cudnnCreatePersistentRNNPlan": "CreatePersistentRNNPlan", - "cudnnSetPersistentRNNPlan": "SetPersistentRNNPlan", - "cudnnDestroyPersistentRNNPlan": "DestroyPersistentRNNPlan", - "cudnnSetRNNDescriptor": "SetRNNDescriptor", - "cudnnGetRNNDescriptor": "GetRNNDescriptor", - "cudnnSetRNNMatrixMathType": "SetRNNMatrixMathType", - "cudnnGetRNNWorkspaceSize": "GetRNNWorkspaceSize", - "cudnnGetRNNTrainingReserveSize": "GetRNNTrainingReserveSize", - "cudnnGetRNNParamsSize": "GetRNNParamsSize", - "cudnnGetRNNLinLayerMatrixParams": "GetRNNLinLayerMatrixParams", - "cudnnGetRNNLinLayerBiasParams": "GetRNNLinLayerBiasParams", - "cudnnRNNForwardInference": "RNNForwardInference", - "cudnnRNNForwardTraining": "RNNForwardTraining", - "cudnnRNNBackwardData": "RNNBackwardData", - "cudnnRNNBackwardWeights": "RNNBackwardWeights", - "cudnnCreateCTCLossDescriptor": "CreateCTCLossDescriptor", - "cudnnSetCTCLossDescriptor": "SetCTCLossDescriptor", - "cudnnGetCTCLossDescriptor": "GetCTCLossDescriptor", - "cudnnDestroyCTCLossDescriptor": "DestroyCTCLossDescriptor", - "cudnnCTCLoss": "CTCLoss", - "cudnnGetCTCLossWorkspaceSize": "GetCTCLossWorkspaceSize", - "cudnnSetRNNDescriptor_v6": "SetRNNDescriptor_v6", - "cudnnSetRNNDescriptor_v5": "SetRNNDescriptor_v5", + "cudnnActivationBackward": "ActivationBackward", + "cudnnActivationForward": "ActivationForward", + "cudnnAddTensor": "AddTensor", + 
"cudnnAdvInferVersionCheck": "AdvInferVersionCheck", + "cudnnAdvTrainVersionCheck": "AdvTrainVersionCheck", + "cudnnBackendCreateDescriptor": "BackendCreateDescriptor", + "cudnnBackendDestroyDescriptor": "BackendDestroyDescriptor", + "cudnnBackendExecute": "BackendExecute", + "cudnnBackendFinalize": "BackendFinalize", + "cudnnBackendGetAttribute": "BackendGetAttribute", + "cudnnBackendInitialize": "BackendInitialize", + "cudnnBackendSetAttribute": "BackendSetAttribute", + "cudnnBatchNormalizationBackward": "BatchNormalizationBackward", + "cudnnBatchNormalizationBackwardEx": "BatchNormalizationBackwardEx", + "cudnnBatchNormalizationForwardInference": "BatchNormalizationForwardInference", + "cudnnBatchNormalizationForwardTraining": "BatchNormalizationForwardTraining", + "cudnnBatchNormalizationForwardTrainingEx": "BatchNormalizationForwardTrainingEx", + "cudnnBuildRNNDynamic": "BuildRNNDynamic", + "cudnnCTCLoss": "CTCLoss", + "cudnnCTCLoss_v8": "CTCLoss_v8", + "cudnnCnnInferVersionCheck": "CnnInferVersionCheck", + "cudnnCnnTrainVersionCheck": "CnnTrainVersionCheck", + "cudnnConvolutionBackwardBias": "ConvolutionBackwardBias", + "cudnnConvolutionBackwardData": "ConvolutionBackwardData", + "cudnnConvolutionBackwardFilter": "ConvolutionBackwardFilter", + "cudnnConvolutionBiasActivationForward": "ConvolutionBiasActivationForward", + "cudnnConvolutionForward": "ConvolutionForward", + "cudnnCopyAlgorithmDescriptor": "CopyAlgorithmDescriptor", + "cudnnCreate": "Create", + "cudnnCreateActivationDescriptor": "CreateActivationDescriptor", + "cudnnCreateAlgorithmDescriptor": "CreateAlgorithmDescriptor", + "cudnnCreateAlgorithmPerformance": "CreateAlgorithmPerformance", + "cudnnCreateAttnDescriptor": "CreateAttnDescriptor", + "cudnnCreateCTCLossDescriptor": "CreateCTCLossDescriptor", + "cudnnCreateConvolutionDescriptor": "CreateConvolutionDescriptor", + "cudnnCreateDropoutDescriptor": "CreateDropoutDescriptor", + "cudnnCreateFilterDescriptor": "CreateFilterDescriptor", + "cudnnCreateFusedOpsConstParamPack": "CreateFusedOpsConstParamPack", + "cudnnCreateFusedOpsPlan": "CreateFusedOpsPlan", + "cudnnCreateFusedOpsVariantParamPack": "CreateFusedOpsVariantParamPack", + "cudnnCreateLRNDescriptor": "CreateLRNDescriptor", + "cudnnCreateOpTensorDescriptor": "CreateOpTensorDescriptor", + "cudnnCreatePersistentRNNPlan": "CreatePersistentRNNPlan", + "cudnnCreatePoolingDescriptor": "CreatePoolingDescriptor", + "cudnnCreateRNNDataDescriptor": "CreateRNNDataDescriptor", + "cudnnCreateRNNDescriptor": "CreateRNNDescriptor", + "cudnnCreateReduceTensorDescriptor": "CreateReduceTensorDescriptor", + "cudnnCreateSeqDataDescriptor": "CreateSeqDataDescriptor", + "cudnnCreateSpatialTransformerDescriptor": "CreateSpatialTransformerDescriptor", + "cudnnCreateTensorDescriptor": "CreateTensorDescriptor", + "cudnnCreateTensorTransformDescriptor": "CreateTensorTransformDescriptor", + "cudnnDeriveBNTensorDescriptor": "DeriveBNTensorDescriptor", + "cudnnDeriveNormTensorDescriptor": "DeriveNormTensorDescriptor", + "cudnnDestroy": "Destroy", + "cudnnDestroyActivationDescriptor": "DestroyActivationDescriptor", + "cudnnDestroyAlgorithmDescriptor": "DestroyAlgorithmDescriptor", + "cudnnDestroyAlgorithmPerformance": "DestroyAlgorithmPerformance", + "cudnnDestroyAttnDescriptor": "DestroyAttnDescriptor", + "cudnnDestroyCTCLossDescriptor": "DestroyCTCLossDescriptor", + "cudnnDestroyConvolutionDescriptor": "DestroyConvolutionDescriptor", + "cudnnDestroyDropoutDescriptor": "DestroyDropoutDescriptor", + "cudnnDestroyFilterDescriptor": 
"DestroyFilterDescriptor", + "cudnnDestroyFusedOpsConstParamPack": "DestroyFusedOpsConstParamPack", + "cudnnDestroyFusedOpsPlan": "DestroyFusedOpsPlan", + "cudnnDestroyFusedOpsVariantParamPack": "DestroyFusedOpsVariantParamPack", + "cudnnDestroyLRNDescriptor": "DestroyLRNDescriptor", + "cudnnDestroyOpTensorDescriptor": "DestroyOpTensorDescriptor", + "cudnnDestroyPersistentRNNPlan": "DestroyPersistentRNNPlan", + "cudnnDestroyPoolingDescriptor": "DestroyPoolingDescriptor", + "cudnnDestroyRNNDataDescriptor": "DestroyRNNDataDescriptor", + "cudnnDestroyRNNDescriptor": "DestroyRNNDescriptor", + "cudnnDestroyReduceTensorDescriptor": "DestroyReduceTensorDescriptor", + "cudnnDestroySeqDataDescriptor": "DestroySeqDataDescriptor", + "cudnnDestroySpatialTransformerDescriptor": "DestroySpatialTransformerDescriptor", + "cudnnDestroyTensorDescriptor": "DestroyTensorDescriptor", + "cudnnDestroyTensorTransformDescriptor": "DestroyTensorTransformDescriptor", + "cudnnDivisiveNormalizationBackward": "DivisiveNormalizationBackward", + "cudnnDivisiveNormalizationForward": "DivisiveNormalizationForward", + "cudnnDropoutBackward": "DropoutBackward", + "cudnnDropoutForward": "DropoutForward", + "cudnnDropoutGetReserveSpaceSize": "DropoutGetReserveSpaceSize", + "cudnnDropoutGetStatesSize": "DropoutGetStatesSize", + "cudnnFindConvolutionBackwardDataAlgorithm": "FindConvolutionBackwardDataAlgorithm", + "cudnnFindConvolutionBackwardDataAlgorithmEx": "FindConvolutionBackwardDataAlgorithmEx", + "cudnnFindConvolutionBackwardFilterAlgorithm": "FindConvolutionBackwardFilterAlgorithm", + "cudnnFindConvolutionBackwardFilterAlgorithmEx": "FindConvolutionBackwardFilterAlgorithmEx", + "cudnnFindConvolutionForwardAlgorithm": "FindConvolutionForwardAlgorithm", + "cudnnFindConvolutionForwardAlgorithmEx": "FindConvolutionForwardAlgorithmEx", + "cudnnFindRNNBackwardDataAlgorithmEx": "FindRNNBackwardDataAlgorithmEx", + "cudnnFindRNNBackwardWeightsAlgorithmEx": "FindRNNBackwardWeightsAlgorithmEx", + "cudnnFindRNNForwardInferenceAlgorithmEx": "FindRNNForwardInferenceAlgorithmEx", + "cudnnFindRNNForwardTrainingAlgorithmEx": "FindRNNForwardTrainingAlgorithmEx", + "cudnnFusedOpsExecute": "FusedOpsExecute", + "cudnnGetActivationDescriptor": "GetActivationDescriptor", + "cudnnGetAlgorithmDescriptor": "GetAlgorithmDescriptor", + "cudnnGetAlgorithmPerformance": "GetAlgorithmPerformance", + "cudnnGetAlgorithmSpaceSize": "GetAlgorithmSpaceSize", + "cudnnGetAttnDescriptor": "GetAttnDescriptor", + "cudnnGetBatchNormalizationBackwardExWorkspaceSize": "GetBatchNormalizationBackwardExWorkspaceSize", + "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize": "GetBatchNormalizationForwardTrainingExWorkspaceSize", + "cudnnGetBatchNormalizationTrainingExReserveSpaceSize": "GetBatchNormalizationTrainingExReserveSpaceSize", + "cudnnGetCTCLossDescriptor": "GetCTCLossDescriptor", + "cudnnGetCTCLossDescriptorEx": "GetCTCLossDescriptorEx", + "cudnnGetCTCLossDescriptor_v8": "GetCTCLossDescriptor_v8", + "cudnnGetCTCLossWorkspaceSize": "GetCTCLossWorkspaceSize", + "cudnnGetCTCLossWorkspaceSize_v8": "GetCTCLossWorkspaceSize_v8", + "cudnnGetCallback": "GetCallback", + "cudnnGetConvolution2dDescriptor": "GetConvolution2dDescriptor", + "cudnnGetConvolution2dForwardOutputDim": "GetConvolution2dForwardOutputDim", + "cudnnGetConvolutionBackwardDataAlgorithmMaxCount": "GetConvolutionBackwardDataAlgorithmMaxCount", + "cudnnGetConvolutionBackwardDataAlgorithm_v7": "GetConvolutionBackwardDataAlgorithm_v7", + "cudnnGetConvolutionBackwardDataWorkspaceSize": 
"GetConvolutionBackwardDataWorkspaceSize", + "cudnnGetConvolutionBackwardFilterAlgorithmMaxCount": "GetConvolutionBackwardFilterAlgorithmMaxCount", + "cudnnGetConvolutionBackwardFilterAlgorithm_v7": "GetConvolutionBackwardFilterAlgorithm_v7", + "cudnnGetConvolutionBackwardFilterWorkspaceSize": "GetConvolutionBackwardFilterWorkspaceSize", + "cudnnGetConvolutionForwardAlgorithmMaxCount": "GetConvolutionForwardAlgorithmMaxCount", + "cudnnGetConvolutionForwardAlgorithm_v7": "GetConvolutionForwardAlgorithm_v7", + "cudnnGetConvolutionForwardWorkspaceSize": "GetConvolutionForwardWorkspaceSize", + "cudnnGetConvolutionGroupCount": "GetConvolutionGroupCount", + "cudnnGetConvolutionMathType": "GetConvolutionMathType", + "cudnnGetConvolutionNdDescriptor": "GetConvolutionNdDescriptor", + "cudnnGetConvolutionNdForwardOutputDim": "GetConvolutionNdForwardOutputDim", + "cudnnGetConvolutionReorderType": "GetConvolutionReorderType", + "cudnnGetCudartVersion": "GetCudartVersion", + "cudnnGetDropoutDescriptor": "GetDropoutDescriptor", + "cudnnGetErrorString": "GetErrorString", + "cudnnGetFilter4dDescriptor": "GetFilter4dDescriptor", + "cudnnGetFilterNdDescriptor": "GetFilterNdDescriptor", + "cudnnGetFilterSizeInBytes": "GetFilterSizeInBytes", + "cudnnGetFoldedConvBackwardDataDescriptors": "GetFoldedConvBackwardDataDescriptors", + "cudnnGetFusedOpsConstParamPackAttribute": "GetFusedOpsConstParamPackAttribute", + "cudnnGetFusedOpsVariantParamPackAttribute": "GetFusedOpsVariantParamPackAttribute", + "cudnnGetLRNDescriptor": "GetLRNDescriptor", + "cudnnGetMultiHeadAttnBuffers": "GetMultiHeadAttnBuffers", + "cudnnGetMultiHeadAttnWeights": "GetMultiHeadAttnWeights", + "cudnnGetNormalizationBackwardWorkspaceSize": "GetNormalizationBackwardWorkspaceSize", + "cudnnGetNormalizationForwardTrainingWorkspaceSize": "GetNormalizationForwardTrainingWorkspaceSize", + "cudnnGetNormalizationTrainingReserveSpaceSize": "GetNormalizationTrainingReserveSpaceSize", + "cudnnGetOpTensorDescriptor": "GetOpTensorDescriptor", + "cudnnGetPooling2dDescriptor": "GetPooling2dDescriptor", + "cudnnGetPooling2dForwardOutputDim": "GetPooling2dForwardOutputDim", + "cudnnGetPoolingNdDescriptor": "GetPoolingNdDescriptor", + "cudnnGetPoolingNdForwardOutputDim": "GetPoolingNdForwardOutputDim", + "cudnnGetProperty": "GetProperty", + "cudnnGetRNNBackwardDataAlgorithmMaxCount": "GetRNNBackwardDataAlgorithmMaxCount", + "cudnnGetRNNBackwardWeightsAlgorithmMaxCount": "GetRNNBackwardWeightsAlgorithmMaxCount", + "cudnnGetRNNBiasMode": "GetRNNBiasMode", + "cudnnGetRNNDataDescriptor": "GetRNNDataDescriptor", + "cudnnGetRNNDescriptor_v6": "GetRNNDescriptor_v6", + "cudnnGetRNNDescriptor_v8": "GetRNNDescriptor_v8", + "cudnnGetRNNForwardInferenceAlgorithmMaxCount": "GetRNNForwardInferenceAlgorithmMaxCount", + "cudnnGetRNNForwardTrainingAlgorithmMaxCount": "GetRNNForwardTrainingAlgorithmMaxCount", + "cudnnGetRNNLinLayerBiasParams": "GetRNNLinLayerBiasParams", + "cudnnGetRNNLinLayerMatrixParams": "GetRNNLinLayerMatrixParams", + "cudnnGetRNNMatrixMathType": "GetRNNMatrixMathType", + "cudnnGetRNNPaddingMode": "GetRNNPaddingMode", + "cudnnGetRNNParamsSize": "GetRNNParamsSize", + "cudnnGetRNNProjectionLayers": "GetRNNProjectionLayers", + "cudnnGetRNNTempSpaceSizes": "GetRNNTempSpaceSizes", + "cudnnGetRNNTrainingReserveSize": "GetRNNTrainingReserveSize", + "cudnnGetRNNWeightParams": "GetRNNWeightParams", + "cudnnGetRNNWeightSpaceSize": "GetRNNWeightSpaceSize", + "cudnnGetRNNWorkspaceSize": "GetRNNWorkspaceSize", + "cudnnGetReduceTensorDescriptor": 
"GetReduceTensorDescriptor", + "cudnnGetReductionIndicesSize": "GetReductionIndicesSize", + "cudnnGetReductionWorkspaceSize": "GetReductionWorkspaceSize", + "cudnnGetSeqDataDescriptor": "GetSeqDataDescriptor", + "cudnnGetStream": "GetStream", + "cudnnGetTensor4dDescriptor": "GetTensor4dDescriptor", + "cudnnGetTensorNdDescriptor": "GetTensorNdDescriptor", + "cudnnGetTensorSizeInBytes": "GetTensorSizeInBytes", + "cudnnGetTensorTransformDescriptor": "GetTensorTransformDescriptor", + "cudnnGetVersion": "GetVersion", + "cudnnIm2Col": "Im2Col", + "cudnnInitTransformDest": "InitTransformDest", + "cudnnLRNCrossChannelBackward": "LRNCrossChannelBackward", + "cudnnLRNCrossChannelForward": "LRNCrossChannelForward", + "cudnnMakeFusedOpsPlan": "MakeFusedOpsPlan", + "cudnnMultiHeadAttnBackwardData": "MultiHeadAttnBackwardData", + "cudnnMultiHeadAttnBackwardWeights": "MultiHeadAttnBackwardWeights", + "cudnnMultiHeadAttnForward": "MultiHeadAttnForward", + "cudnnNormalizationBackward": "NormalizationBackward", + "cudnnNormalizationForwardInference": "NormalizationForwardInference", + "cudnnNormalizationForwardTraining": "NormalizationForwardTraining", + "cudnnOpTensor": "OpTensor", + "cudnnOpsInferVersionCheck": "OpsInferVersionCheck", + "cudnnOpsTrainVersionCheck": "OpsTrainVersionCheck", + "cudnnPoolingBackward": "PoolingBackward", + "cudnnPoolingForward": "PoolingForward", + "cudnnQueryRuntimeError": "QueryRuntimeError", + "cudnnRNNBackwardData": "RNNBackwardData", + "cudnnRNNBackwardDataEx": "RNNBackwardDataEx", + "cudnnRNNBackwardData_v8": "RNNBackwardData_v8", + "cudnnRNNBackwardWeights": "RNNBackwardWeights", + "cudnnRNNBackwardWeightsEx": "RNNBackwardWeightsEx", + "cudnnRNNBackwardWeights_v8": "RNNBackwardWeights_v8", + "cudnnRNNForward": "RNNForward", + "cudnnRNNForwardInference": "RNNForwardInference", + "cudnnRNNForwardInferenceEx": "RNNForwardInferenceEx", + "cudnnRNNForwardTraining": "RNNForwardTraining", + "cudnnRNNForwardTrainingEx": "RNNForwardTrainingEx", + "cudnnRNNGetClip": "RNNGetClip", + "cudnnRNNGetClip_v8": "RNNGetClip_v8", + "cudnnRNNSetClip": "RNNSetClip", + "cudnnRNNSetClip_v8": "RNNSetClip_v8", + "cudnnReduceTensor": "ReduceTensor", + "cudnnReorderFilterAndBias": "ReorderFilterAndBias", + "cudnnRestoreAlgorithm": "RestoreAlgorithm", + "cudnnRestoreDropoutDescriptor": "RestoreDropoutDescriptor", + "cudnnSaveAlgorithm": "SaveAlgorithm", + "cudnnScaleTensor": "ScaleTensor", + "cudnnSetActivationDescriptor": "SetActivationDescriptor", + "cudnnSetAlgorithmDescriptor": "SetAlgorithmDescriptor", + "cudnnSetAlgorithmPerformance": "SetAlgorithmPerformance", + "cudnnSetAttnDescriptor": "SetAttnDescriptor", + "cudnnSetCTCLossDescriptor": "SetCTCLossDescriptor", + "cudnnSetCTCLossDescriptorEx": "SetCTCLossDescriptorEx", + "cudnnSetCTCLossDescriptor_v8": "SetCTCLossDescriptor_v8", + "cudnnSetCallback": "SetCallback", + "cudnnSetConvolution2dDescriptor": "SetConvolution2dDescriptor", + "cudnnSetConvolutionGroupCount": "SetConvolutionGroupCount", + "cudnnSetConvolutionMathType": "SetConvolutionMathType", + "cudnnSetConvolutionNdDescriptor": "SetConvolutionNdDescriptor", + "cudnnSetConvolutionReorderType": "SetConvolutionReorderType", + "cudnnSetDropoutDescriptor": "SetDropoutDescriptor", + "cudnnSetFilter4dDescriptor": "SetFilter4dDescriptor", + "cudnnSetFilterNdDescriptor": "SetFilterNdDescriptor", + "cudnnSetFusedOpsConstParamPackAttribute": "SetFusedOpsConstParamPackAttribute", + "cudnnSetFusedOpsVariantParamPackAttribute": "SetFusedOpsVariantParamPackAttribute", + "cudnnSetLRNDescriptor": 
"SetLRNDescriptor", + "cudnnSetOpTensorDescriptor": "SetOpTensorDescriptor", + "cudnnSetPersistentRNNPlan": "SetPersistentRNNPlan", + "cudnnSetPooling2dDescriptor": "SetPooling2dDescriptor", + "cudnnSetPoolingNdDescriptor": "SetPoolingNdDescriptor", + "cudnnSetRNNAlgorithmDescriptor": "SetRNNAlgorithmDescriptor", + "cudnnSetRNNBiasMode": "SetRNNBiasMode", + "cudnnSetRNNDataDescriptor": "SetRNNDataDescriptor", + "cudnnSetRNNDescriptor_v6": "SetRNNDescriptor_v6", + "cudnnSetRNNDescriptor_v8": "SetRNNDescriptor_v8", + "cudnnSetRNNMatrixMathType": "SetRNNMatrixMathType", + "cudnnSetRNNPaddingMode": "SetRNNPaddingMode", + "cudnnSetRNNProjectionLayers": "SetRNNProjectionLayers", + "cudnnSetReduceTensorDescriptor": "SetReduceTensorDescriptor", + "cudnnSetSeqDataDescriptor": "SetSeqDataDescriptor", + "cudnnSetSpatialTransformerNdDescriptor": "SetSpatialTransformerNdDescriptor", + "cudnnSetStream": "SetStream", + "cudnnSetTensor": "SetTensor", + "cudnnSetTensor4dDescriptor": "SetTensor4dDescriptor", + "cudnnSetTensor4dDescriptorEx": "SetTensor4dDescriptorEx", + "cudnnSetTensorNdDescriptor": "SetTensorNdDescriptor", + "cudnnSetTensorNdDescriptorEx": "SetTensorNdDescriptorEx", + "cudnnSetTensorTransformDescriptor": "SetTensorTransformDescriptor", + "cudnnSoftmaxBackward": "SoftmaxBackward", + "cudnnSoftmaxForward": "SoftmaxForward", + "cudnnSpatialTfGridGeneratorBackward": "SpatialTfGridGeneratorBackward", + "cudnnSpatialTfGridGeneratorForward": "SpatialTfGridGeneratorForward", + "cudnnSpatialTfSamplerBackward": "SpatialTfSamplerBackward", + "cudnnSpatialTfSamplerForward": "SpatialTfSamplerForward", + "cudnnTransformFilter": "TransformFilter", + "cudnnTransformTensor": "TransformTensor", + "cudnnTransformTensorEx": "TransformTensorEx", } enumMappings = map[string]string{ - "cudnnStatus_t": "Status", - "cudnnErrQueryMode_t": "ErrQueryMode", - "cudnnDataType_t": "DataType", - "cudnnMathType_t": "MathType", - "cudnnNanPropagation_t": "NanPropagation", - "cudnnDeterminism_t": "Determinism", - "cudnnTensorFormat_t": "TensorFormat", - "cudnnOpTensorOp_t": "OpTensorOp", - "cudnnReduceTensorOp_t": "ReduceTensorOp", - "cudnnReduceTensorIndices_t": "ReduceTensorIndices", - "cudnnIndicesType_t": "IndicesType", - "cudnnConvolutionMode_t": "ConvolutionMode", - // "cudnnConvolutionFwdPreference_t": "ConvolutionFwdPreference", - "cudnnConvolutionFwdPreference_t": "ConvolutionPreference", - "cudnnConvolutionFwdAlgo_t": "ConvolutionFwdAlgo", - // "cudnnConvolutionBwdFilterPreference_t": "ConvolutionBwdFilterPreference", - "cudnnConvolutionBwdFilterPreference_t": "ConvolutionPreference", - "cudnnConvolutionBwdFilterAlgo_t": "ConvolutionBwdFilterAlgo", - // "cudnnConvolutionBwdDataPreference_t": "ConvolutionBwdDataPreference", - "cudnnConvolutionBwdDataPreference_t": "ConvolutionPreference", - "cudnnConvolutionBwdDataAlgo_t": "ConvolutionBwdDataAlgo", - "cudnnSoftmaxAlgorithm_t": "SoftmaxAlgorithm", - "cudnnSoftmaxMode_t": "SoftmaxMode", - "cudnnPoolingMode_t": "PoolingMode", - "cudnnActivationMode_t": "ActivationMode", - "cudnnLRNMode_t": "LRNMode", - "cudnnDivNormMode_t": "DivNormMode", - "cudnnBatchNormMode_t": "BatchNormMode", - "cudnnSamplerType_t": "SamplerType", - "cudnnRNNMode_t": "RNNMode", - "cudnnDirectionMode_t": "DirectionMode", - "cudnnRNNInputMode_t": "RNNInputMode", - "cudnnRNNAlgo_t": "RNNAlgo", - "cudnnCTCLossAlgo_t": "CTCLossAlgo", + "cudnnActivationMode_t": "ActivationMode", + "cudnnBackendAttributeName_t": "BackendAttributeName", + "cudnnBackendAttributeType_t": "BackendAttributeType", + 
"cudnnBackendDescriptorType_t": "BackendDescriptorType", + "cudnnBackendHeurMode_t": "BackendHeurMode", + "cudnnBackendKnobType_t": "BackendKnobType", + "cudnnBackendLayoutType_t": "BackendLayoutType", + "cudnnBackendNumericalNote_t": "BackendNumericalNote", + "cudnnBatchNormMode_t": "BatchNormMode", + "cudnnBatchNormOps_t": "BatchNormOps", + "cudnnCTCLossAlgo_t": "CTCLossAlgo", + "cudnnConvolutionBwdDataAlgo_t": "ConvolutionBwdDataAlgo", + "cudnnConvolutionBwdFilterAlgo_t": "ConvolutionBwdFilterAlgo", + "cudnnConvolutionFwdAlgo_t": "ConvolutionFwdAlgo", + "cudnnConvolutionMode_t": "ConvolutionMode", + "cudnnDataType_t": "DataType", + "cudnnDeterminism_t": "Determinism", + "cudnnDirectionMode_t": "DirectionMode", + "cudnnDivNormMode_t": "DivNormMode", + "cudnnErrQueryMode_t": "ErrQueryMode", + "cudnnFoldingDirection_t": "FoldingDirection", + "cudnnForwardMode_t": "ForwardMode", + "cudnnFusedOpsConstParamLabel_t": "FusedOpsConstParamLabel", + "cudnnFusedOpsPointerPlaceHolder_t": "FusedOpsPointerPlaceHolder", + "cudnnFusedOpsVariantParamLabel_t": "FusedOpsVariantParamLabel", + "cudnnFusedOps_t": "FusedOps", + "cudnnGenStatsMode_t": "GenStatsMode", + "cudnnIndicesType_t": "IndicesType", + "cudnnLRNMode_t": "LRNMode", + "cudnnLossNormalizationMode_t": "LossNormalizationMode", + "cudnnMathType_t": "MathType", + "cudnnMultiHeadAttnWeightKind_t": "MultiHeadAttnWeightKind", + "cudnnNanPropagation_t": "NanPropagation", + "cudnnNormAlgo_t": "NormAlgo", + "cudnnNormMode_t": "NormMode", + "cudnnNormOps_t": "NormOps", + "cudnnOpTensorOp_t": "OpTensorOp", + "cudnnPointwiseMode_t": "PointwiseMode", + "cudnnPoolingMode_t": "PoolingMode", + "cudnnRNNAlgo_t": "RNNAlgo", + "cudnnRNNBiasMode_t": "RNNBiasMode", + "cudnnRNNClipMode_t": "RNNClipMode", + "cudnnRNNDataLayout_t": "RNNDataLayout", + "cudnnRNNInputMode_t": "RNNInputMode", + "cudnnRNNMode_t": "RNNMode", + "cudnnReduceTensorIndices_t": "ReduceTensorIndices", + "cudnnReduceTensorOp_t": "ReduceTensorOp", + "cudnnReorderType_t": "ReorderType", + "cudnnSamplerType_t": "SamplerType", + "cudnnSeqDataAxis_t": "SeqDataAxis", + "cudnnSeverity_t": "Severity", + "cudnnSoftmaxAlgorithm_t": "SoftmaxAlgorithm", + "cudnnSoftmaxMode_t": "SoftmaxMode", + "cudnnStatus_t": "Status", + "cudnnTensorFormat_t": "TensorFormat", + "cudnnWgradMode_t": "WgradMode", } - alphaBetas = map[string]map[int]string{ - "cudnnTransformTensor": {4: "beta", 1: "alpha"}, - "cudnnAddTensor": {4: "beta", 1: "alpha"}, - "cudnnOpTensor": {8: "beta", 5: "alpha2", 2: "alpha1"}, - "cudnnReduceTensor": {9: "beta", 6: "alpha"}, - "cudnnScaleTensor": {3: "alpha"}, - "cudnnConvolutionForward": {10: "beta", 1: "alpha"}, - "cudnnConvolutionBiasActivationForward": {10: "alpha2", 1: "alpha1"}, - "cudnnConvolutionBackwardBias": {4: "beta", 1: "alpha"}, - "cudnnConvolutionBackwardFilter": {10: "beta", 1: "alpha"}, - "cudnnConvolutionBackwardData": {10: "beta", 1: "alpha"}, - "cudnnSoftmaxForward": {6: "beta", 3: "alpha"}, - "cudnnSoftmaxBackward": {8: "beta", 3: "alpha"}, - "cudnnPoolingForward": {5: "beta", 2: "alpha"}, - "cudnnPoolingBackward": {9: "beta", 2: "alpha"}, - "cudnnActivationForward": {5: "beta", 2: "alpha"}, - "cudnnActivationBackward": {9: "beta", 2: "alpha"}, - "cudnnLRNCrossChannelForward": {6: "beta", 3: "alpha"}, - "cudnnLRNCrossChannelBackward": {10: "beta", 3: "alpha"}, - "cudnnDivisiveNormalizationForward": {9: "beta", 3: "alpha"}, - "cudnnDivisiveNormalizationBackward": {10: "beta", 3: "alpha"}, - "cudnnBatchNormalizationForwardTraining": {3: "beta", 2: "alpha"}, - 
"cudnnBatchNormalizationForwardInference": {3: "beta", 2: "alpha"}, - "cudnnBatchNormalizationBackward": {5: "betaParamDiff", 4: "alphaParamDiff", 3: "betaDataDiff", 2: "alphaDataDiff"}, - "cudnnSpatialTfSamplerForward": {6: "beta", 2: "alpha"}, - "cudnnSpatialTfSamplerBackward": {5: "beta", 2: "alpha"}, + "cudnnActivationBackward": {9: "beta", 2: "alpha"}, + "cudnnActivationForward": {5: "beta", 2: "alpha"}, + "cudnnAddTensor": {4: "beta", 1: "alpha"}, + "cudnnBatchNormalizationBackward": {5: "betaParamDiff", 4: "alphaParamDiff", 3: "betaDataDiff", 2: "alphaDataDiff"}, + "cudnnBatchNormalizationBackwardEx": {6: "betaParamDiff", 5: "alphaParamDiff", 4: "betaDataDiff", 3: "alphaDataDiff"}, + "cudnnBatchNormalizationForwardInference": {3: "beta", 2: "alpha"}, + "cudnnBatchNormalizationForwardTraining": {3: "beta", 2: "alpha"}, + "cudnnBatchNormalizationForwardTrainingEx": {4: "beta", 3: "alpha"}, + "cudnnConvolutionBackwardBias": {4: "beta", 1: "alpha"}, + "cudnnConvolutionBackwardData": {10: "beta", 1: "alpha"}, + "cudnnConvolutionBackwardFilter": {10: "beta", 1: "alpha"}, + "cudnnConvolutionBiasActivationForward": {10: "alpha2", 1: "alpha1"}, + "cudnnConvolutionForward": {10: "beta", 1: "alpha"}, + "cudnnDivisiveNormalizationBackward": {10: "beta", 3: "alpha"}, + "cudnnDivisiveNormalizationForward": {9: "beta", 3: "alpha"}, + "cudnnLRNCrossChannelBackward": {10: "beta", 3: "alpha"}, + "cudnnLRNCrossChannelForward": {6: "beta", 3: "alpha"}, + "cudnnNormalizationBackward": {7: "betaParamDiff", 6: "alphaParamDiff", 5: "betaDataDiff", 4: "alphaDataDiff"}, + "cudnnNormalizationForwardInference": {5: "beta", 4: "alpha"}, + "cudnnNormalizationForwardTraining": {5: "beta", 4: "alpha"}, + "cudnnOpTensor": {8: "beta", 5: "alpha2", 2: "alpha1"}, + "cudnnPoolingBackward": {9: "beta", 2: "alpha"}, + "cudnnPoolingForward": {5: "beta", 2: "alpha"}, + "cudnnReduceTensor": {9: "beta", 6: "alpha"}, + "cudnnScaleTensor": {3: "alpha"}, + "cudnnSoftmaxBackward": {8: "beta", 3: "alpha"}, + "cudnnSoftmaxForward": {6: "beta", 3: "alpha"}, + "cudnnSpatialTfSamplerBackward": {5: "beta", 2: "alpha"}, + "cudnnSpatialTfSamplerForward": {6: "beta", 2: "alpha"}, + "cudnnTransformFilter": {5: "beta", 2: "alpha"}, + "cudnnTransformTensor": {4: "beta", 1: "alpha"}, + "cudnnTransformTensorEx": {5: "beta", 2: "alpha"}, } - creations = map[string][]string{ - "cudnnConvolutionDescriptor_t": {"cudnnCreateConvolutionDescriptor"}, - "cudnnPersistentRNNPlan_t": {"cudnnCreatePersistentRNNPlan"}, - "cudnnLRNDescriptor_t": {"cudnnCreateLRNDescriptor"}, - "cudnnTensorDescriptor_t": {"cudnnCreateTensorDescriptor"}, - "cudnnFilterDescriptor_t": {"cudnnCreateFilterDescriptor"}, - "cudnnPoolingDescriptor_t": {"cudnnCreatePoolingDescriptor"}, "cudnnActivationDescriptor_t": {"cudnnCreateActivationDescriptor"}, + "cudnnAlgorithmDescriptor_t": {"cudnnCreateAlgorithmDescriptor"}, + "cudnnAlgorithmPerformance_t": {"cudnnCreateAlgorithmPerformance"}, + "cudnnAttnDescriptor_t": {"cudnnCreateAttnDescriptor"}, + "cudnnBackendDescriptor_t": {"cudnnBackendCreateDescriptor"}, + "cudnnConvolutionDescriptor_t": {"cudnnCreateConvolutionDescriptor"}, "cudnnDropoutDescriptor_t": {"cudnnCreateDropoutDescriptor"}, - "cudnnRNNDescriptor_t": {"cudnnCreateRNNDescriptor"}, - "cudnnCTCLossDescriptor_t": {"cudnnCreateCTCLossDescriptor"}, + "cudnnFilterDescriptor_t": {"cudnnCreateFilterDescriptor"}, + "cudnnFusedOpsConstParamPack_t": {"cudnnCreateFusedOpsConstParamPack"}, + "cudnnFusedOpsPlan_t": {"cudnnCreateFusedOpsPlan"}, + "cudnnFusedOpsVariantParamPack_t": 
{"cudnnCreateFusedOpsVariantParamPack"}, "cudnnHandle_t": {"cudnnCreate"}, + "cudnnLRNDescriptor_t": {"cudnnCreateLRNDescriptor"}, "cudnnOpTensorDescriptor_t": {"cudnnCreateOpTensorDescriptor"}, + "cudnnPersistentRNNPlan_t": {"cudnnCreatePersistentRNNPlan"}, + "cudnnPoolingDescriptor_t": {"cudnnCreatePoolingDescriptor"}, + "cudnnRNNDataDescriptor_t": {"cudnnCreateRNNDataDescriptor"}, + "cudnnRNNDescriptor_t": {"cudnnCreateRNNDescriptor"}, "cudnnReduceTensorDescriptor_t": {"cudnnCreateReduceTensorDescriptor"}, + "cudnnSeqDataDescriptor_t": {"cudnnCreateSeqDataDescriptor"}, "cudnnSpatialTransformerDescriptor_t": {"cudnnCreateSpatialTransformerDescriptor"}, + "cudnnTensorDescriptor_t": {"cudnnCreateTensorDescriptor"}, + "cudnnTensorTransformDescriptor_t": {"cudnnCreateTensorTransformDescriptor"}, + "cudnnCTCLossDescriptor_t": {"cudnnCreateCTCLossDescriptor"}, } setFns = map[string][]string{ - "cudnnOpTensorDescriptor_t": {"cudnnSetOpTensorDescriptor"}, - "cudnnPoolingDescriptor_t": {"cudnnSetPooling2dDescriptor", "cudnnSetPoolingNdDescriptor"}, + "cudaStream_t": {"cudnnSetStream"}, "cudnnActivationDescriptor_t": {"cudnnSetActivationDescriptor"}, + "cudnnAlgorithmDescriptor_t": {"cudnnSetAlgorithmDescriptor"}, + "cudnnAlgorithmPerformance_t": {"cudnnSetAlgorithmPerformance"}, + "cudnnAttnDescriptor_t": {"cudnnSetAttnDescriptor"}, + "cudnnBackendDescriptor_t": {"cudnnBackendSetAttribute"}, + "cudnnCTCLossDescriptor_t": {"cudnnSetCTCLossDescriptor", "cudnnSetCTCLossDescriptorEx", "cudnnSetCTCLossDescriptor_v8"}, + "cudnnConvolutionDescriptor_t": {"cudnnSetConvolution2dDescriptor", "cudnnSetConvolutionGroupCount", "cudnnSetConvolutionMathType", "cudnnSetConvolutionNdDescriptor", "cudnnSetConvolutionReorderType"}, "cudnnDropoutDescriptor_t": {"cudnnSetDropoutDescriptor"}, - "cudnnRNNDescriptor_t": {"cudnnSetPersistentRNNPlan", "cudnnSetRNNDescriptor", "cudnnSetRNNMatrixMathType", "cudnnSetRNNDescriptor_v6", "cudnnSetRNNDescriptor_v5"}, - "cudnnCTCLossDescriptor_t": {"cudnnSetCTCLossDescriptor"}, - "cudaStream_t": {"cudnnSetStream"}, - "cudnnTensorDescriptor_t": {"cudnnSetTensor4dDescriptor", "cudnnSetTensor4dDescriptorEx", "cudnnSetTensorNdDescriptor", "cudnnSetTensorNdDescriptorEx", "cudnnSetTensor"}, - "cudnnReduceTensorDescriptor_t": {"cudnnSetReduceTensorDescriptor"}, "cudnnFilterDescriptor_t": {"cudnnSetFilter4dDescriptor", "cudnnSetFilterNdDescriptor"}, - "cudnnConvolutionDescriptor_t": {"cudnnSetConvolutionMathType", "cudnnSetConvolutionGroupCount", "cudnnSetConvolution2dDescriptor", "cudnnSetConvolutionNdDescriptor"}, + "cudnnFusedOpsConstParamPack_t": {"cudnnSetFusedOpsConstParamPackAttribute"}, + "cudnnFusedOpsVariantParamPack_t": {"cudnnSetFusedOpsVariantParamPackAttribute"}, "cudnnLRNDescriptor_t": {"cudnnSetLRNDescriptor"}, + "cudnnOpTensorDescriptor_t": {"cudnnSetOpTensorDescriptor"}, + "cudnnPoolingDescriptor_t": {"cudnnSetPooling2dDescriptor", "cudnnSetPoolingNdDescriptor"}, + "cudnnRNNDataDescriptor_t": {"cudnnSetRNNDataDescriptor"}, + "cudnnRNNDescriptor_t": {"cudnnRNNSetClip", "cudnnRNNSetClip_v8", "cudnnSetPersistentRNNPlan", "cudnnSetRNNAlgorithmDescriptor", "cudnnSetRNNBiasMode", "cudnnSetRNNDescriptor_v6", "cudnnSetRNNDescriptor_v8", "cudnnSetRNNMatrixMathType", "cudnnSetRNNPaddingMode", "cudnnSetRNNProjectionLayers"}, + "cudnnReduceTensorDescriptor_t": {"cudnnSetReduceTensorDescriptor"}, + "cudnnSeqDataDescriptor_t": {"cudnnSetSeqDataDescriptor"}, "cudnnSpatialTransformerDescriptor_t": {"cudnnSetSpatialTransformerNdDescriptor"}, + "cudnnTensorDescriptor_t": 
{"cudnnSetTensor", "cudnnSetTensor4dDescriptor", "cudnnSetTensor4dDescriptorEx", "cudnnSetTensorNdDescriptor", "cudnnSetTensorNdDescriptorEx"}, + "cudnnTensorTransformDescriptor_t": {"cudnnSetTensorTransformDescriptor"}, + "unsigned": {"cudnnSetCallback"}, } destructions = map[string][]string{ - "cudnnReduceTensorDescriptor_t": {"cudnnDestroyReduceTensorDescriptor"}, - "cudnnPoolingDescriptor_t": {"cudnnDestroyPoolingDescriptor"}, - "cudnnSpatialTransformerDescriptor_t": {"cudnnDestroySpatialTransformerDescriptor"}, + "cudnnActivationDescriptor_t": {"cudnnDestroyActivationDescriptor"}, + "cudnnAlgorithmPerformance_t": {"cudnnDestroyAlgorithmPerformance"}, + "cudnnAttnDescriptor_t": {"cudnnDestroyAttnDescriptor"}, + "cudnnBackendDescriptor_t": {"cudnnBackendDestroyDescriptor"}, + "cudnnCTCLossDescriptor_t": {"cudnnDestroyCTCLossDescriptor"}, + "cudnnConvolutionDescriptor_t": {"cudnnDestroyConvolutionDescriptor"}, "cudnnDropoutDescriptor_t": {"cudnnDestroyDropoutDescriptor"}, - "cudnnPersistentRNNPlan_t": {"cudnnDestroyPersistentRNNPlan"}, + "cudnnFilterDescriptor_t": {"cudnnDestroyFilterDescriptor"}, + "cudnnFusedOpsConstParamPack_t": {"cudnnDestroyFusedOpsConstParamPack"}, + "cudnnFusedOpsPlan_t": {"cudnnDestroyFusedOpsPlan"}, + "cudnnFusedOpsVariantParamPack_t": {"cudnnDestroyFusedOpsVariantParamPack"}, "cudnnHandle_t": {"cudnnDestroy"}, + "cudnnLRNDescriptor_t": {"cudnnDestroyLRNDescriptor"}, "cudnnOpTensorDescriptor_t": {"cudnnDestroyOpTensorDescriptor"}, + "cudnnPersistentRNNPlan_t": {"cudnnDestroyPersistentRNNPlan"}, + "cudnnPoolingDescriptor_t": {"cudnnDestroyPoolingDescriptor"}, + "cudnnRNNDataDescriptor_t": {"cudnnDestroyRNNDataDescriptor"}, "cudnnRNNDescriptor_t": {"cudnnDestroyRNNDescriptor"}, - "cudnnConvolutionDescriptor_t": {"cudnnDestroyConvolutionDescriptor"}, - "cudnnLRNDescriptor_t": {"cudnnDestroyLRNDescriptor"}, + "cudnnReduceTensorDescriptor_t": {"cudnnDestroyReduceTensorDescriptor"}, + "cudnnSeqDataDescriptor_t": {"cudnnDestroySeqDataDescriptor"}, + "cudnnSpatialTransformerDescriptor_t": {"cudnnDestroySpatialTransformerDescriptor"}, "cudnnTensorDescriptor_t": {"cudnnDestroyTensorDescriptor"}, - "cudnnFilterDescriptor_t": {"cudnnDestroyFilterDescriptor"}, - "cudnnActivationDescriptor_t": {"cudnnDestroyActivationDescriptor"}, - "cudnnCTCLossDescriptor_t": {"cudnnDestroyCTCLossDescriptor"}, + "cudnnTensorTransformDescriptor_t": {"cudnnDestroyTensorTransformDescriptor"}, + "cudnnAlgorithmDescriptor_t": {"cudnnDestroyAlgorithmDescriptor"}, } methods = map[string][]string{ - "cudnnOpTensorDescriptor_t": {"cudnnGetOpTensorDescriptor"}, - "cudnnReduceTensorDescriptor_t": {"cudnnGetReduceTensorDescriptor"}, - "cudnnPoolingDescriptor_t": {"cudnnGetPooling2dDescriptor", "cudnnGetPoolingNdDescriptor", "cudnnGetPoolingNdForwardOutputDim", "cudnnGetPooling2dForwardOutputDim"}, - "cudnnActivationDescriptor_t": {"cudnnGetActivationDescriptor"}, - "cudnnDropoutDescriptor_t": {"cudnnRestoreDropoutDescriptor", "cudnnGetDropoutDescriptor"}, - "cudnnCTCLossDescriptor_t": {"cudnnGetCTCLossDescriptor"}, - "cudnnTensorDescriptor_t": {"cudnnGetTensor4dDescriptor", "cudnnGetTensorNdDescriptor", "cudnnGetTensorSizeInBytes", "cudnnDeriveBNTensorDescriptor", "cudnnDropoutGetReserveSpaceSize"}, - "cudnnHandle_t": {"cudnnTransformTensor", "cudnnAddTensor", "cudnnOpTensor", "cudnnGetReductionIndicesSize", "cudnnGetReductionWorkspaceSize", "cudnnReduceTensor", "cudnnScaleTensor", "cudnnFindConvolutionForwardAlgorithm", "cudnnFindConvolutionForwardAlgorithmEx", "cudnnGetConvolutionForwardAlgorithm", 
"cudnnGetConvolutionForwardAlgorithm_v7", "cudnnGetConvolutionForwardWorkspaceSize", "cudnnConvolutionForward", "cudnnConvolutionBiasActivationForward", "cudnnConvolutionBackwardBias", "cudnnGetConvolutionBackwardFilterAlgorithmMaxCount", "cudnnFindConvolutionBackwardFilterAlgorithm", "cudnnFindConvolutionBackwardFilterAlgorithmEx", "cudnnGetConvolutionBackwardFilterAlgorithm", "cudnnGetConvolutionBackwardFilterAlgorithm_v7", "cudnnGetConvolutionBackwardFilterWorkspaceSize", "cudnnConvolutionBackwardFilter", "cudnnGetConvolutionBackwardDataAlgorithmMaxCount", "cudnnFindConvolutionBackwardDataAlgorithm", "cudnnFindConvolutionBackwardDataAlgorithmEx", "cudnnGetConvolutionBackwardDataAlgorithm", "cudnnGetConvolutionBackwardDataAlgorithm_v7", "cudnnGetConvolutionBackwardDataWorkspaceSize", "cudnnConvolutionBackwardData", "cudnnIm2Col", "cudnnSoftmaxForward", "cudnnSoftmaxBackward", "cudnnPoolingForward", "cudnnPoolingBackward", "cudnnActivationForward", "cudnnActivationBackward", "cudnnLRNCrossChannelForward", "cudnnLRNCrossChannelBackward", "cudnnDivisiveNormalizationForward", "cudnnDivisiveNormalizationBackward", "cudnnBatchNormalizationForwardTraining", "cudnnBatchNormalizationForwardInference", "cudnnBatchNormalizationBackward", "cudnnSpatialTfGridGeneratorForward", "cudnnSpatialTfGridGeneratorBackward", "cudnnSpatialTfSamplerForward", "cudnnSpatialTfSamplerBackward", "cudnnDropoutGetStatesSize", "cudnnDropoutForward", "cudnnDropoutBackward", "cudnnGetRNNDescriptor", "cudnnGetRNNWorkspaceSize", "cudnnGetRNNTrainingReserveSize", "cudnnGetRNNParamsSize", "cudnnGetRNNLinLayerMatrixParams", "cudnnGetRNNLinLayerBiasParams", "cudnnRNNForwardInference", "cudnnRNNForwardTraining", "cudnnRNNBackwardData", "cudnnRNNBackwardWeights", "cudnnCTCLoss", "cudnnGetCTCLossWorkspaceSize"}, - "cudnnFilterDescriptor_t": {"cudnnGetFilter4dDescriptor", "cudnnGetFilterNdDescriptor"}, - "cudnnLRNDescriptor_t": {"cudnnGetLRNDescriptor"}, + "cudnnHandle_t": {"cudnnActivationBackward", "cudnnActivationForward", "cudnnAddTensor", "cudnnBatchNormalizationBackward", "cudnnBatchNormalizationForwardInference", "cudnnBatchNormalizationForwardTraining", "cudnnCTCLoss", "cudnnConvolutionBackwardBias", "cudnnConvolutionBackwardData", "cudnnConvolutionBackwardFilter", "cudnnConvolutionBiasActivationForward", "cudnnConvolutionForward", "cudnnDivisiveNormalizationBackward", "cudnnDivisiveNormalizationForward", "cudnnDropoutBackward", "cudnnDropoutForward", "cudnnDropoutGetStatesSize", "cudnnFindConvolutionBackwardDataAlgorithm", "cudnnFindConvolutionBackwardDataAlgorithmEx", "cudnnFindConvolutionBackwardFilterAlgorithm", "cudnnFindConvolutionBackwardFilterAlgorithmEx", "cudnnFindConvolutionForwardAlgorithm", "cudnnFindConvolutionForwardAlgorithmEx", "cudnnGetRNNLinLayerBiasParams", "cudnnGetRNNLinLayerMatrixParams", "cudnnGetRNNParamsSize", "cudnnGetRNNTrainingReserveSize", "cudnnGetRNNWorkspaceSize", "cudnnGetReductionIndicesSize", "cudnnGetReductionWorkspaceSize", "cudnnIm2Col", "cudnnLRNCrossChannelBackward", "cudnnLRNCrossChannelForward", "cudnnOpTensor", "cudnnPoolingBackward", "cudnnPoolingForward", "cudnnRNNBackwardData", "cudnnRNNBackwardWeights", "cudnnRNNForwardInference", "cudnnRNNForwardTraining", "cudnnReduceTensor", "cudnnScaleTensor", "cudnnSoftmaxBackward", "cudnnSoftmaxForward", "cudnnSpatialTfGridGeneratorBackward", "cudnnSpatialTfGridGeneratorForward", "cudnnSpatialTfSamplerBackward", "cudnnSpatialTfSamplerForward", "cudnnTransformTensor"}, + "cudnnTensorDescriptor_t": {"cudnnDeriveBNTensorDescriptor", 
"cudnnDropoutGetReserveSpaceSize"}, + "cudnnDropoutDescriptor_t": {"cudnnRestoreDropoutDescriptor"}, } } diff --git a/cmd/gencudnn/mappings.ods b/cmd/gencudnn/mappings.ods new file mode 100644 index 0000000..5290fc6 Binary files /dev/null and b/cmd/gencudnn/mappings.ods differ diff --git a/cmd/gencudnn/moredecls.go b/cmd/gencudnn/moredecls.go index 126d710..2c9f580 100644 --- a/cmd/gencudnn/moredecls.go +++ b/cmd/gencudnn/moredecls.go @@ -1,440 +1,689 @@ package main +/* generated by parse.py. DO NOT EDIT */ var inputParams = map[string][]string{ - "cudnnActivationBackward": {"handle", "activationDesc", "", "alpha", "beta", "yDesc", "y", "dyDesc", "dy", "xDesc", "x", "dxDesc"}, - "cudnnActivationForward": {"handle", "activationDesc", "alpha", "beta", "xDesc", "x", "yDesc"}, - "cudnnAddTensor": {"handle", "alpha", "beta", "aDesc", "A", "cDesc"}, - "cudnnBatchNormalizationBackward": {"handle", "mode", "alphaDataDiff", "betaDataDiff", "alphaParamDiff", "betaParamDiff", "xDesc", "x", "dyDesc", "dy", "dxDesc", "dx", "bnScaleBiasDiffDesc", "bnScale", "epsilon", "savedMean", "savedInvVariance"}, - "cudnnBatchNormalizationForwardInference": {"handle", "mode", "alpha", "beta", "xDesc", "yDesc", "x", "y", "bnScaleBiasMeanVarDesc", "bnScaleData", "bnBiasData", "estimatedMean", "estimatedVariance", "epsilon"}, - "cudnnBatchNormalizationForwardTraining": {"handle", "mode", "alpha", "beta", "xDesc", "yDesc", "x", "y", "bnScaleBiasMeanVarDesc", "bnScale", "bnBias", "exponentialAverageFactor", "epsilon"}, - "cudnnCTCLoss": {"handle", "probsDesc", "probs", "labels", "labelLengths", "inputLengths", "gradientsDesc", "algo", "ctcLossDesc", "workspace", "sizeInBytes"}, - "cudnnConvolutionBackwardBias": {"handle", "alpha", "beta", "dyDesc", "dy", "dbDesc"}, - "cudnnConvolutionBackwardData": {"handle", "alpha", "beta", "wDesc", "w", "dyDesc", "dy", "convDesc", "algo", "workSpace", "workSpaceSizeInBytes", "dxDesc"}, - "cudnnConvolutionBackwardFilter": {"handle", "alpha", "beta", "xDesc", "x", "dyDesc", "dy", "convDesc", "algo", "workSpace", "workSpaceSizeInBytes", "dwDesc"}, - "cudnnConvolutionBiasActivationForward": {"handle", "alpha1", "alpha2", "xDesc", "x", "wDesc", "w", "convDesc", "algo", "workSpace", "workSpaceSizeInBytes", "zDesc", "z", "biasDesc", "bias", "activationDesc", "yDesc"}, - "cudnnConvolutionForward": {"handle", "alpha", "beta", "xDesc", "x", "wDesc", "w", "convDesc", "algo", "workSpace", "workSpaceSizeInBytes", "yDesc"}, - "cudnnCreateTensorDescriptor": {"tensorDesc"}, - "cudnnDeriveBNTensorDescriptor": {"xDesc", "mode"}, - "cudnnDestroy": {"handle"}, - "cudnnDestroyCTCLossDescriptor": {"ctcLossDesc"}, - "cudnnDestroyOpTensorDescriptor": {"opTensorDesc"}, - "cudnnDestroyReduceTensorDescriptor": {"tensorDesc"}, - "cudnnDestroyTensorDescriptor": {"tensorDesc"}, - "cudnnDivisiveNormalizationBackward": {"handle", "normDesc", "mode", "alpha", "beta", "xDesc", "x", "means", "dy", "dxDesc"}, - "cudnnDivisiveNormalizationForward": {"handle", "normDesc", "divNormMode", "alpha", "beta", "xDesc", "yDesc", "x", "means"}, - "cudnnDropoutBackward": {"handle", "dropoutDesc", "dyDesc", "dy", "dxDesc", "reserveSpace", "reserveSpaceSizeInBytes"}, - "cudnnDropoutForward": {"handle", "dropoutDesc", "xDesc", "x", "yDesc", "reserveSpaceSizeInBytes"}, - "cudnnDropoutGetReserveSpaceSize": {"xDesc"}, - "cudnnDropoutGetStatesSize": {"handle"}, - "cudnnFindConvolutionBackwardDataAlgorithm": {"handle", "wDesc", "dyDesc", "convDesc", "dxDesc", "requestedAlgoCount"}, - "cudnnFindConvolutionBackwardDataAlgorithmEx": 
{"handle", "wDesc", "w", "dyDesc", "dy", "convDesc", "dxDesc", "requestedAlgoCount", "workSpace", "workSpaceSizeInBytes"}, - "cudnnFindConvolutionBackwardFilterAlgorithm": {"handle", "xDesc", "dyDesc", "convDesc", "dwDesc", "requestedAlgoCount"}, - "cudnnFindConvolutionBackwardFilterAlgorithmEx": {"handle", "xDesc", "x", "dyDesc", "dy", "convDesc", "dwDesc", "requestedAlgoCount", "workSpace", "workSpaceSizeInBytes"}, - "cudnnFindConvolutionForwardAlgorithm": {"handle", "xDesc", "wDesc", "convDesc", "yDesc", "requestedAlgoCount"}, - "cudnnFindConvolutionForwardAlgorithmEx": {"handle", "xDesc", "x", "wDesc", "w", "convDesc", "yDesc", "requestedAlgoCount", "workSpace", "workSpaceSizeInBytes"}, - "cudnnFindRNNBackwardDataAlgorithmEx": {"handle", "rnnDesc", "seqLength", "yDesc", "y", "dyDesc", "dy", "dhyDesc", "dhy", "dcyDesc", "dcy", "wDesc", "w", "hxDesc", "hx", "cxDesc", "cx", "dxDesc", "dhxDesc", "dcxDesc", "findIntensity", "requestedAlgoCount", "workspace", "workSpaceSizeInBytes", "reserveSpaceSizeInBytes"}, - "cudnnFindRNNBackwardWeightsAlgorithmEx": {"handle", "rnnDesc", "seqLength", "xDesc", "x", "hxDesc", "hx", "yDesc", "y", "findIntensity", "requestedAlgoCount", "workspace", "workSpaceSizeInBytes", "dwDesc", "reserveSpace", "reserveSpaceSizeInBytes"}, - "cudnnFindRNNForwardInferenceAlgorithmEx": {"handle", "rnnDesc", "seqLength", "xDesc", "x", "hxDesc", "hx", "cxDesc", "cx", "wDesc", "w", "yDesc", "hyDesc", "cyDesc", "findIntensity", "requestedAlgoCount", "workspace", "workSpaceSizeInBytes"}, - "cudnnFindRNNForwardTrainingAlgorithmEx": {"handle", "rnnDesc", "xDesc", "seqLength", "x", "hxDesc", "hx", "cxDesc", "cx", "wDesc", "w", "yDesc", "hyDesc", "cyDesc", "findIntensity", "requestedAlgoCount", "workspace", "workSpaceSizeInBytes", "reserveSpaceSizeInBytes"}, - "cudnnGetActivationDescriptor": {"activationDesc"}, - "cudnnGetAlgorithmDescriptor": {"algorithmDesc", "algorithm"}, - "cudnnGetAlgorithmSpaceSize": {"handle", "algoDesc"}, - "cudnnGetCTCLossDescriptor": {"ctcLossDesc"}, - "cudnnGetCTCLossWorkspaceSize": {"handle", "probsDesc", "gradientsDesc", "labels", "labelLengths", "inputLengths", "algo", "ctcLossDesc"}, - "cudnnGetConvolution2dForwardOutputDim": {"convDesc", "inputTensorDesc", "filterDesc"}, - "cudnnGetConvolutionBackwardDataAlgorithm": {"handle", "wDesc", "dyDesc", "convDesc", "dxDesc", "preference", "memoryLimitInBytes"}, - "cudnnGetConvolutionBackwardDataAlgorithmMaxCount": {"handle"}, - "cudnnGetConvolutionBackwardDataAlgorithm_v7": {"handle", "wDesc", "dyDesc", "convDesc", "dxDesc", "requestedAlgoCount"}, - "cudnnGetConvolutionBackwardDataWorkspaceSize": {"handle", "wDesc", "dyDesc", "convDesc", "dxDesc", "algo"}, - "cudnnGetConvolutionBackwardFilterAlgorithm": {"handle", "xDesc", "dyDesc", "convDesc", "dwDesc", "preference", "memoryLimitInBytes"}, - "cudnnGetConvolutionBackwardFilterAlgorithmMaxCount": {"handle"}, - "cudnnGetConvolutionBackwardFilterAlgorithm_v7": {"handle", "xDesc", "dyDesc", "convDesc", "dwDesc", "requestedAlgoCount"}, - "cudnnGetConvolutionBackwardFilterWorkspaceSize": {"handle", "xDesc", "dyDesc", "convDesc", "dwDesc", "algo"}, - "cudnnGetConvolutionForwardAlgorithm": {"handle", "xDesc", "wDesc", "convDesc", "yDesc", "preference", "memoryLimitInBytes"}, - "cudnnGetConvolutionForwardAlgorithmMaxCount": {"handle"}, - "cudnnGetConvolutionForwardAlgorithm_v7": {"handle", "xDesc", "wDesc", "convDesc", "yDesc", "requestedAlgoCount"}, - "cudnnGetConvolutionForwardWorkspaceSize": {"handle", "xDesc", "wDesc", "convDesc", "yDesc", "algo"}, - 
"cudnnGetConvolutionNdDescriptor": {"arrayLengthRequested"}, - "cudnnGetConvolutionNdForwardOutputDim": {"convDesc", "inputTensorDesc", "filterDesc", "nbDims"}, - "cudnnGetDropoutDescriptor": {"dropoutDesc", "handle"}, - "cudnnGetErrorString": {"status"}, - "cudnnGetFilter4dDescriptor": {"filterDesc"}, - "cudnnGetFilterNdDescriptor": {"wDesc", "nbDimsRequested"}, - "cudnnGetOpTensorDescriptor": {"opTensorDesc"}, - "cudnnGetPooling2dDescriptor": {"poolingDesc"}, - "cudnnGetPooling2dForwardOutputDim": {"poolingDesc", "inputDesc"}, - "cudnnGetPoolingNdDescriptor": {"poolingDesc", "nbDimsRequested", "maxpoolingNanOpt"}, - "cudnnGetPoolingNdForwardOutputDim": {"poolingDesc", "inputDesc", "nbDims"}, - "cudnnGetProperty": {"type"}, - "cudnnGetRNNDataDescriptor": {"RNNDataDesc", "arrayLengthRequested"}, - "cudnnGetRNNDescriptor": {"handle", "rnnDesc"}, - "cudnnGetRNNLinLayerBiasParams": {"handle", "rnnDesc", "pseudoLayer", "xDesc", "wDesc", "w", "linLayerID"}, - "cudnnGetRNNLinLayerMatrixParams": {"handle", "rnnDesc", "pseudoLayer", "xDesc", "wDesc", "w", "linLayerID"}, - "cudnnGetRNNParamsSize": {"handle", "rnnDesc", "xDesc", "dataType"}, - "cudnnGetRNNPaddingMode": {"*paddingMode"}, - "cudnnGetRNNProjectionLayers": {"handle", "rnnDesc"}, - "cudnnGetRNNTrainingReserveSize": {"handle", "rnnDesc", "seqLength", "xDesc"}, - "cudnnGetRNNWorkspaceSize": {"handle", "rnnDesc", "seqLength", "xDesc"}, - "cudnnGetReduceTensorDescriptor": {"reduceTensorDesc", "reduceTensorNanOpt"}, - "cudnnGetReductionIndicesSize": {"handle", "reduceDesc", "aDesc", "cDesc"}, - "cudnnGetReductionWorkspaceSize": {"handle", "reduceDesc", "aDesc", "cDesc"}, - "cudnnGetStream": {"handle"}, - "cudnnGetTensor4dDescriptor": {"tensorDesc"}, - "cudnnGetTensorNdDescriptor": {"tensorDesc", "nbDimsRequested", "strideA"}, - "cudnnGetTensorSizeInBytes": {"tensorDesc"}, - "cudnnIm2Col": {"handle", "srcDesc", "srcData", "filterDesc", "convDesc"}, - "cudnnLRNCrossChannelBackward": {"handle", "normDesc", "lrnMode", "alpha", "beta", "yDesc", "y", "dyDesc", "dy", "xDesc", "x"}, - "cudnnLRNCrossChannelForward": {"handle", "normDesc", "lrnMode", "alpha", "beta", "xDesc", "yDesc", "x"}, - "cudnnOpTensor": {"handle", "opTensorDesc", "alpha1", "alpha2", "beta", "aDesc", "bDesc", "cDesc", "A", "B"}, - "cudnnPoolingBackward": {"handle", "poolingDesc", "alpha", "beta", "yDesc", "y", "dyDesc", "dy", "xDesc", "x", "dxDesc"}, - "cudnnPoolingForward": {"handle", "poolingDesc", "alpha", "beta", "xDesc", "x", "yDesc"}, - "cudnnQueryRuntimeError": {"handle", "mode"}, - "cudnnRNNBackwardData": {"handle", "rnnDesc", "seqLength", "yDesc", "y", "dyDesc", "dy", "dhyDesc", "dhy", "dcyDesc", "dcy", "wDesc", "w", "hxDesc", "hx", "cxDesc", "cx", "dxDesc", "dhxDesc", "dcxDesc", "workspace", "workSpaceSizeInBytes", "reserveSpaceSizeInBytes"}, - "cudnnRNNBackwardDataEx": {"handle", "rnnDesc", "yDesc", "y", "dyDesc", "dy", "dhyDesc", "dhy", "dcyDesc", "dcy", "wDesc", "w", "hxDesc", "hx", "cxDesc", "cx", "dxDesc", "dhxDesc", "dcxDesc", "dkDesc", "dkeys", "workspace", "workSpaceSizeInBytes", "reserveSpaceSizeInBytes"}, - "cudnnRNNBackwardWeights": {"handle", "rnnDesc", "seqLength", "xDesc", "x", "hxDesc", "hx", "yDesc", "y", "workspace", "workSpaceSizeInBytes", "dwDesc", "reserveSpace", "reserveSpaceSizeInBytes"}, - "cudnnRNNBackwardWeightsEx": {"handle", "rnnDesc", "seqLength", "xDesc", "x", "hxDesc", "hx", "yDesc", "y", "workspace", "workSpaceSizeInBytes", "dwDesc", "reserveSpace", "reserveSpaceSizeInBytes"}, - "cudnnRNNForwardInference": {"handle", "rnnDesc", "seqLength", 
"xDesc", "x", "hxDesc", "hx", "cxDesc", "cx", "wDesc", "w", "yDesc", "hyDesc", "cyDesc", "workspace", "workSpaceSizeInBytes"}, - "cudnnRNNForwardInferenceEx": {"handle", "rnnDesc", "xDesc", "x", "hxDesc", "hx", "cxDesc", "cx", "wDesc", "w", "yDesc", "hyDesc", "cyDesc", "kDesc", "Keys", "cDesc", "cAttn", "iDesc", "iAttn", "qDesc", "Queries", "workspace", "workSpaceSizeInBytes"}, - "cudnnRNNForwardTraining": {"handle", "rnnDesc", "seqLength", "xDesc", "x", "hxDesc", "hx", "cxDesc", "cx", "wDesc", "w", "yDesc", "hyDesc", "cyDesc", "workspace", "workSpaceSizeInBytes", "reserveSpaceSizeInBytes"}, - "cudnnRNNForwardTrainingEx": {"handle", "rnnDesc", "xDesc", "x", "hxDesc", "hx", "cxDesc", "cx", "wDesc", "w", "yDesc", "hyDesc", "cyDesc", "kDesc", "Keys", "cDesc", "cAttn", "iDesc", "iAttn", "qDesc", "Queries", "workspace", "workSpaceSizeInBytes", "reserveSpaceSizeInBytes"}, - "cudnnRNNSetClip": {"clipMode", "lclip", "rclip", "clipNanOpt"}, - "cudnnReduceTensor": {"handle", "reduceTensorDesc", "indicesSizeInBytes", "workspace", "workspaceSizeInBytes", "alpha", "beta", "aDesc", "cDesc", "A"}, - "cudnnRestoreAlgorithm": {"handle", "algoDesc", "algoSpace", "algoSpaceSizeInBytes"}, - "cudnnRestoreDropoutDescriptor": {"handle", "dropout", "states", "stateSizeInBytes", "seed"}, - "cudnnSaveAlgorithm": {"handle", "algoDesc", "algoSpace", "algoSpaceSizeInBytes"}, - "cudnnScaleTensor": {"handle", "yDesc", "alpha"}, - "cudnnSetActivationDescriptor": {"mode", "reluNanOpt", "coef"}, - "cudnnSetAlgorithmDescriptor": {"algorithm"}, - "cudnnSetAlgorithmPerformance": {"algoDesc", "status", "time", "memory"}, - "cudnnSetCTCLossDescriptor": {"compType"}, - "cudnnSetCallback": {"mask", "udata", "fptr"}, - "cudnnSetConvolution2dDescriptor": {"pad_h", "pad_w", "u", "v", "dilation_h", "dilation_w", "mode", "computeType"}, - "cudnnSetConvolutionNdDescriptor": {"arrayLength", "padA", "filterStrideA", "dilationA", "mode", "datatype"}, - "cudnnSetDropoutDescriptor": {"handle", "dropout", "stateSizeInBytes", "seed"}, - "cudnnSetFilter4dDescriptor": {"datatype", "format", "k", "c", "h", "w"}, - "cudnnSetFilterNdDescriptor": {"datatype", "format", "nbDims", "filterDimA"}, - "cudnnSetLRNDescriptor": {"lrnN", "lrnAlpha", "lrnBeta", "lrnK"}, - "cudnnSetOpTensorDescriptor": {"opTensorOp", "opTensorCompType", "opTensorNanOpt"}, - "cudnnSetPooling2dDescriptor": {"mode", "maxpoolingNanOpt", "windowHeight", "windowWidth", "verticalPadding", "horizontalPadding", "verticalStride", "horizontalStride"}, - "cudnnSetPoolingNdDescriptor": {"mode", "maxpoolingNanOpt", "nbDims"}, - "cudnnSetRNNDataDescriptor": {"dataType", "layout", "maxSeqLength", "batchSize", "vectorSize", "seqLengthArray", "paddingFill"}, - "cudnnSetRNNDescriptor": {"hiddenSize", "numLayers", "dropoutDesc", "inputMode", "direction", "mode", "dataType"}, - "cudnnSetRNNDescriptor_v5": {"hiddenSize", "numLayers", "dropoutDesc", "inputMode", "direction", "mode", "dataType"}, - "cudnnSetRNNDescriptor_v6": {"handle", "hiddenSize", "numLayers", "dropoutDesc", "inputMode", "direction", "mode", "algo", "dataType"}, - "cudnnSetRNNMatrixMathType": {"rnnDesc", "mType"}, - "cudnnSetRNNPaddingMode": {"paddingMode"}, - "cudnnSetRNNProjectionLayers": {"handle", "rnnDesc", "recProjSize", "outProjSize"}, - "cudnnSetReduceTensorDescriptor": {"reduceTensorOp", "reduceTensorCompType", "reduceTensorNanOpt", "reduceTensorIndices", "reduceTensorIndicesType"}, - "cudnnSetSpatialTransformerNdDescriptor": {"samplerType", "dataType", "nbDims", "dimA"}, - "cudnnSetStream": {"handle", "streamID"}, - 
"cudnnSetTensor": {"handle", "yDesc", "valuePtr"}, - "cudnnSetTensor4dDescriptor": {"format", "datatype", "n", "c", "h", "w"}, - "cudnnSetTensor4dDescriptorEx": {"datatype", "n", "c", "h", "w", "nStride", "cStride", "hStride", "wStride"}, - "cudnnSetTensorNdDescriptor": {"datatype", "nbDims", "dimA", "strideA"}, - "cudnnSetTensorNdDescriptorEx": {"format", "dataType", "nbDims", "dimA"}, - "cudnnSoftmaxBackward": {"handle", "algorithm", "mode", "alpha", "beta", "yDesc", "y", "dyDesc", "dy", "dxDesc"}, - "cudnnSoftmaxForward": {"handle", "algorithm", "mode", "alpha", "beta", "xDesc", "x", "yDesc"}, - "cudnnSpatialTfGridGeneratorBackward": {"handle", "stDesc", "dgrid"}, - "cudnnSpatialTfGridGeneratorForward": {"handle", "stDesc", "theta"}, - "cudnnSpatialTfSamplerBackward": {"handle", "stDesc", "alpha", "beta", "xDesc", "x", "dxDesc", "alphaDgrid", "betaDgrid", "dyDesc", "dy", "grid"}, - "cudnnSpatialTfSamplerForward": {"handle", "stDesc", "alpha", "beta", "xDesc", "x", "grid", "yDesc"}, - "cudnnTransformTensor": {"handle", "alpha", "beta", "xDesc", "x", "yDesc"}, + "cudnnActivationForward": {"handle", "activationDesc", "alpha", "beta", "xDesc", "x", "yDesc"}, + "cudnnAddTensor": {"handle", "alpha", "beta", "aDesc", "A", "cDesc"}, + "cudnnBatchNormalizationForwardInference": {"handle", "mode", "alpha", "beta", "xDesc", "yDesc", "*x", "*y", "bnScaleBiasMeanVarDesc", "bnScale", "bnBias", "estimatedMean", "estimatedVariance", "epsilon"}, + "cudnnCreateActivationDescriptor": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_ALLOC_FAILED"}, + "cudnnCreateAlgorithmDescriptor": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_ALLOC_FAILED"}, + "cudnnCreateAlgorithmPerformance": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_ALLOC_FAILED"}, + "cudnnCreateDropoutDescriptor": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_ALLOC_FAILED"}, + "cudnnCreateFilterDescriptor": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_ALLOC_FAILED"}, + "cudnnCreateLRNDescriptor": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_ALLOC_FAILED"}, + "cudnnCreatePoolingDescriptor": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_ALLOC_FAILED"}, + "cudnnCreateReduceTensorDescriptor": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_BAD_PARAM", "CUDNN_STATUS_ALLOC_FAILED"}, + "cudnnCreateSpatialTransformerDescriptor": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_ALLOC_FAILED"}, + "cudnnCreateTensorDescriptor": {"tensorDesc"}, + "cudnnDeriveBNTensorDescriptor": {"xDesc", "mode"}, + "cudnnDeriveNormTensorDescriptor": {"xDesc", "mode"}, + "cudnnDestroy": {"handle"}, + "cudnnDestroyActivationDescriptor": {"CUDNN_STATUS_SUCCESS"}, + "cudnnDestroyAlgorithmDescriptor": {"CUDNN_STATUS_SUCCESS"}, + "cudnnDestroyAlgorithmPerformance": {"CUDNN_STATUS_SUCCESS"}, + "cudnnDestroyDropoutDescriptor": {"CUDNN_STATUS_SUCCESS"}, + "cudnnDestroyFilterDescriptor": {"CUDNN_STATUS_SUCCESS"}, + "cudnnDestroyLRNDescriptor": {"CUDNN_STATUS_SUCCESS"}, + "cudnnDestroyOpTensorDescriptor": {"opTensorDesc"}, + "cudnnDestroyPoolingDescriptor": {"CUDNN_STATUS_SUCCESS"}, + "cudnnDestroyReduceTensorDescriptor": {"tensorDesc"}, + "cudnnDestroySpatialTransformerDescriptor": {"CUDNN_STATUS_SUCCESS"}, + "cudnnDestroyTensorDescriptor": {"tensorDesc"}, + "cudnnDestroyTensorTransformDescriptor": {"transformDesc"}, + "cudnnDivisiveNormalizationForward": {"handle", "normDesc", "divNormMode", "alpha", "beta", "xDesc", "yDesc", "x", "means"}, + "cudnnDropoutForward": {"handle", "dropoutDesc", "xDesc", "x", "yDesc", "reserveSpaceSizeInBytes"}, + "cudnnDropoutGetReserveSpaceSize": {"xDesc"}, + "cudnnDropoutGetStatesSize": {"handle"}, + 
"cudnnGetActivationDescriptor": {"activationDesc"}, + "cudnnGetAlgorithmDescriptor": {"algorithmDesc", "algorithm"}, + "cudnnGetAlgorithmSpaceSize": {"handle", "algoDesc"}, + "cudnnGetDropoutDescriptor": {"dropoutDesc", "handle"}, + "cudnnGetErrorString": {"status"}, + "cudnnGetFilter4dDescriptor": {"filterDesc"}, + "cudnnGetFilterNdDescriptor": {"wDesc", "nbDimsRequested"}, + "cudnnGetFilterSizeInBytes": {"filterDesc"}, + "cudnnGetOpTensorDescriptor": {"opTensorDesc"}, + "cudnnGetPooling2dDescriptor": {"poolingDesc"}, + "cudnnGetPooling2dForwardOutputDim": {"poolingDesc", "inputDesc"}, + "cudnnGetPoolingNdDescriptor": {"poolingDesc", "nbDimsRequested", "maxpoolingNanOpt"}, + "cudnnGetPoolingNdForwardOutputDim": {"poolingDesc", "inputDesc", "nbDims"}, + "cudnnGetProperty": {"type"}, + "cudnnGetReduceTensorDescriptor": {"reduceTensorDesc", "reduceTensorNanOpt"}, + "cudnnGetReductionIndicesSize": {"handle", "reduceDesc", "aDesc", "cDesc"}, + "cudnnGetReductionWorkspaceSize": {"handle", "reduceDesc", "aDesc", "cDesc"}, + "cudnnGetStream": {"handle"}, + "cudnnGetTensor4dDescriptor": {"tensorDesc"}, + "cudnnGetTensorNdDescriptor": {"tensorDesc", "nbDimsRequested"}, + "cudnnGetTensorSizeInBytes": {"tensorDesc"}, + "cudnnGetTensorTransformDescriptor": {"transformDesc"}, + "cudnnInitTransformDest": {"transformDesc", "srcDesc"}, + "cudnnLRNCrossChannelForward": {"handle", "normDesc", "lrnMode", "alpha", "beta", "xDesc", "yDesc", "x"}, + "cudnnNormalizationForwardInference": {"handle", "mode", "normOps", "algo", "alpha", "beta", "xDesc", "yDesc", "*x", "zDesc", "*z", "normScaleBiasDesc", "normScale", "normBias", "normMeanVarDesc", "estimatedMean", "estimatedVariance", "activationDesc", "epsilon", "groutCnt"}, + "cudnnOpsInferVersionCheck": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_VERSION_MISMATCH"}, + "cudnnOpTensor": {"handle", "opTensorDesc", "alpha1", "alpha2", "beta", "aDesc", "bDesc", "cDesc", "A", "B"}, + "cudnnPoolingForward": {"handle", "poolingDesc", "alpha", "beta", "xDesc", "x", "yDesc"}, + "cudnnQueryRuntimeError": {"handle", "mode"}, + "cudnnReduceTensor": {"handle", "reduceTensorDesc", "indicesSizeInBytes", "workspace", "workspaceSizeInBytes", "alpha", "beta", "aDesc", "cDesc", "A"}, + "cudnnRestoreAlgorithm": {"handle", "algoDesc", "algoSpace", "algoSpaceSizeInBytes"}, + "cudnnRestoreDropoutDescriptor": {"handle", "dropout", "states", "stateSizeInBytes", "seed"}, + "cudnnSaveAlgorithm": {"handle", "algoDesc", "algoSpace", "algoSpaceSizeInBytes"}, + "cudnnScaleTensor": {"handle", "yDesc", "alpha"}, + "cudnnSetActivationDescriptor": {"mode", "reluNanOpt", "coef"}, + "cudnnSetAlgorithmDescriptor": {"algorithm"}, + "cudnnSetAlgorithmPerformance": {"algoDesc", "status", "time", "memory"}, + "cudnnSetCallback": {"mask", "udata", "fptr"}, + "cudnnSetDropoutDescriptor": {"handle", "dropout", "stateSizeInBytes", "seed"}, + "cudnnSetFilter4dDescriptor": {"datatype", "format", "k", "c", "h", "w"}, + "cudnnSetFilterNdDescriptor": {"datatype", "format", "nbDims", "filterDimA"}, + "cudnnSetLRNDescriptor": {"lrnN", "lrnAlpha", "lrnBeta", "lrnK"}, + "cudnnSetOpTensorDescriptor": {"opTensorOp", "opTensorCompType", "opTensorNanOpt"}, + "cudnnSetPooling2dDescriptor": {"mode", "maxpoolingNanOpt", "windowHeight", "windowWidth", "verticalPadding", "horizontalPadding", "verticalStride", "horizontalStride"}, + "cudnnSetPoolingNdDescriptor": {"mode", "maxpoolingNanOpt", "nbDims", "windowDimA", "paddingA", "strideA"}, + "cudnnSetReduceTensorDescriptor": {"reduceTensorOp", "reduceTensorCompType", 
"reduceTensorNanOpt", "reduceTensorIndices", "reduceTensorIndicesType"}, + "cudnnSetSpatialTransformerNdDescriptor": {"samplerType", "dataType", "nbDims", "dimA"}, + "cudnnSetStream": {"handle", "streamID"}, + "cudnnSetTensor": {"handle", "yDesc", "valuePtr"}, + "cudnnSetTensor4dDescriptor": {"format", "datatype", "n", "c", "h", "w"}, + "cudnnSetTensor4dDescriptorEx": {"datatype", "n", "c", "h", "w", "nStride", "cStride", "hStride", "wStride"}, + "cudnnSetTensorNdDescriptor": {"datatype", "nbDims", "dimA", "strideA"}, + "cudnnSetTensorNdDescriptorEx": {"format", "dataType", "nbDims", "dimA"}, + "cudnnSetTensorTransformDescriptor": {"nbDims", "destFormat", "padBeforeA[]", "padAfterA[]", "foldA[]", "direction"}, + "cudnnSoftmaxForward": {"handle", "algorithm", "mode", "alpha", "beta", "xDesc", "x", "yDesc"}, + "cudnnSpatialTfGridGeneratorForward": {"handle", "stDesc", "theta"}, + "cudnnSpatialTfSamplerForward": {"handle", "stDesc", "alpha", "beta", "xDesc", "x", "grid", "yDesc"}, + "cudnnTransformFilter": {"handle", "transDesc", "alpha", "beta", "srcDesc", "destDesc", "srcData", "destData"}, + "cudnnTransformTensor": {"handle", "alpha", "beta", "xDesc", "x", "yDesc"}, + "cudnnTransformTensorEx": {"handle", "transDesc", "alpha", "beta", "srcDesc", "destDesc", "srcData", "destData"}, + "cudnnActivationBackward": {"handle", "activationDesc", "alpha", "beta", "yDesc", "y", "dyDesc", "dy", "xDesc", "x", "dxDesc"}, + "cudnnBatchNormalizationBackward": {"handle", "mode", "*alphaDataDiff", "*betaDataDiff", "*alphaParamDiff", "*betaParamDiff", "xDesc", "dxDesc", "dyDesc", "*x", "*dy", "*dx", "bnScaleBiasDiffDesc", "*bnScale", "epsilon", "*savedMean", "*savedInvVariance"}, + "cudnnBatchNormalizationBackwardEx": {"handle", "mode", "bnOps", "*alphaDataDiff", "*betaDataDiff", "*alphaParamDiff", "*betaParamDiff", "xDesc", "*x", "yDesc", "*yData", "dyDesc", "*dyData", "dBnScaleBiasDesc", "*bnScaleData", "*bnBiasData", "*dBnScaleData", "dBnBiasData", "epsilon", "*savedMean", "*savedInvVariance", "activationDesc", "workspace", "workSpaceSizeInBytes", "*reserveSpace", "reserveSpaceSizeInBytes"}, + "cudnnBatchNormalizationForwardTraining": {"handle", "mode", "alpha", "beta", "xDesc", "yDesc", "*x", "*y", "bnScaleBiasMeanVarDesc", "bnScale", "bnBias", "exponentialAverageFactor", "epsilon"}, + "cudnnBatchNormalizationForwardTrainingEx": {"handle", "mode", "bnOps", "*alpha", "*beta", "xDesc", "*xData", "zDesc", "*zData", "yDesc", "*yData", "bnScaleBiasMeanVarDesc", "*bnScaleData", "*bnBiasData", "exponentialAverageFactor", "epsilon", "activationDesc", "*workspace", "workSpaceSizeInBytes", "*reserveSpace", "reserveSpaceSizeInBytes"}, + "cudnnDivisiveNormalizationBackward": {"handle", "normDesc", "mode", "alpha", "beta", "xDesc", "x", "means", "dy", "dxDesc"}, + "cudnnDropoutBackward": {"handle", "dropoutDesc", "dyDesc", "dy", "dxDesc", "reserveSpace", "reserveSpaceSizeInBytes"}, + "cudnnGetBatchNormalizationBackwardExWorkspaceSize": {"handle", "mode", "bnOps", "xDesc", "yDesc", "dyDesc", "dzDesc", "dxDesc", "dBnScaleBiasDesc", "activationDesc"}, + "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize": {"handle", "mode", "bnOps", "xDesc", "zDesc", "yDesc", "bnScaleBiasMeanVarDesc", "activationDesc"}, + "cudnnGetBatchNormalizationTrainingExReserveSpaceSize": {"handle", "mode", "bnOps", "xDesc", "activationDesc"}, + "cudnnGetNormalizationBackwardWorkspaceSize": {"handle", "mode", "normOps", "algo", "xDesc", "yDesc", "dyDesc", "dzDesc", "dxDesc", "dNormScaleBiasDesc", "activationDesc", "normMeanVarDesc", 
"groutCnt"}, + "cudnnGetNormalizationForwardTrainingWorkspaceSize": {"handle", "mode", "normOps", "algo", "xDesc", "zDesc", "yDesc", "normScaleBiasDesc", "activationDesc", "normMeanVarDesc", "groutCnt"}, + "cudnnGetNormalizationTrainingReserveSpaceSize": {"handle", "mode", "normOps", "algo", "xDesc", "activationDesc", "groutCnt"}, + "cudnnLRNCrossChannelBackward": {"handle", "normDesc", "lrnMode", "alpha", "beta", "yDesc", "y", "dyDesc", "dy", "xDesc", "x"}, + "cudnnNormalizationBackward": {"dNormScaleBiasDesc", "*normScaleData", "*normBiasData", "*dNormScaleData", "dNormBiasData", "epsilon", "normMeanVarDesc", "*savedMean", "*savedInvVariance", "activationDesc", "workspace", "workSpaceSizeInBytes", "*reserveSpace", "reserveSpaceSizeInBytes", "groutCnt"}, + "cudnnNormalizationForwardTraining": {"handle", "mode", "normOps", "algo", "*alpha", "*beta", "xDesc", "yDesc", "*xData", "zDesc", "*zData", "normScaleBiasDesc", "normScale", "normBias", "exponentialAverageFactor", "normMeanVarDesc", "epsilon", "activationDesc", "*workspace", "workSpaceSizeInBytes", "*reserveSpace", "reserveSpaceSizeInBytes", "groutCnt"}, + "cudnnOpsTrainVersionCheck": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_VERSION_MISMATCH"}, + "cudnnPoolingBackward": {"handle", "poolingDesc", "alpha", "beta", "yDesc", "y", "dyDesc", "dy", "xDesc", "x", "dxDesc"}, + "cudnnSoftmaxBackward": {"handle", "algorithm", "mode", "alpha", "beta", "yDesc", "y", "dyDesc", "dy", "dxDesc"}, + "cudnnSpatialTfGridGeneratorBackward": {"handle", "stDesc", "dgrid"}, + "cudnnSpatialTfSamplerBackward": {"handle", "stDesc", "alpha", "beta", "xDesc", "x", "dxDesc", "alphaDgrid", "betaDgrid", "dyDesc", "dy", "grid"}, + "cudnnBackendCreateDescriptor": {"descriptorType", "descriptor"}, + "cudnnBackendDestroyDescriptor": {"descriptor"}, + "cudnnBackendExecute": {"executionPlan", "variantPack"}, + "cudnnBackendFinalize": {"descriptor"}, + "cudnnBackendGetAttribute": {"descriptor", "attributeName", "attributeType", "requestedElementCount", "elementCount", "arrayOfElements"}, + "cudnnBackendInitialize": {"descriptor", "descriptorType", "sizeInBytes"}, + "cudnnBackendSetAttribute": {"descriptor", "attributeName", "attributeType", "elementCount", "arrayOfElements"}, + "cudnnConvolutionBackwardData": {"handle", "alpha", "beta", "wDesc", "w", "dyDesc", "dy", "convDesc", "algo", "workSpace", "workSpaceSizeInBytes", "dxDesc"}, + "cudnnConvolutionBiasActivationForward": {"handle", "alpha1", "alpha2", "xDesc", "x", "wDesc", "w", "convDesc", "algo", "workSpace", "workSpaceSizeInBytes", "zDesc", "z", "biasDesc", "bias", "activationDesc", "yDesc"}, + "cudnnConvolutionForward": {"handle", "alpha", "beta", "xDesc", "x", "wDesc", "w", "convDesc", "algo", "workSpace", "workSpaceSizeInBytes", "yDesc"}, + "cudnnCreateConvolutionDescriptor": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_ALLOC_FAILED"}, + "cudnnDestroyConvolutionDescriptor": {"CUDNN_STATUS_SUCCESS"}, + "cudnnFindConvolutionBackwardDataAlgorithm": {"handle", "wDesc", "dyDesc", "convDesc", "dxDesc", "requestedAlgoCount"}, + "cudnnFindConvolutionBackwardDataAlgorithmEx": {"handle", "wDesc", "w", "dyDesc", "dy", "convDesc", "dxDesc", "requestedAlgoCount", "workSpace", "workSpaceSizeInBytes"}, + "cudnnFindConvolutionForwardAlgorithm": {"handle", "xDesc", "wDesc", "convDesc", "yDesc", "requestedAlgoCount"}, + "cudnnFindConvolutionForwardAlgorithmEx": {"handle", "xDesc", "x", "wDesc", "w", "convDesc", "yDesc", "requestedAlgoCount", "workSpace", "workSpaceSizeInBytes"}, + "cudnnGetConvolution2dForwardOutputDim": {"convDesc", 
"inputTensorDesc", "filterDesc"}, + "cudnnGetConvolutionBackwardDataAlgorithmMaxCount": {"handle"}, + "cudnnGetConvolutionBackwardDataAlgorithm_v7": {"handle", "wDesc", "dyDesc", "convDesc", "dxDesc", "requestedAlgoCount"}, + "cudnnGetConvolutionBackwardDataWorkspaceSize": {"handle", "wDesc", "dyDesc", "convDesc", "dxDesc", "algo"}, + "cudnnGetConvolutionForwardAlgorithmMaxCount": {"handle"}, + "cudnnGetConvolutionForwardAlgorithm_v7": {"handle", "xDesc", "wDesc", "convDesc", "yDesc", "requestedAlgoCount"}, + "cudnnGetConvolutionForwardWorkspaceSize": {"handle", "xDesc", "wDesc", "convDesc", "yDesc", "algo"}, + "cudnnGetConvolutionGroupCount": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_BAD_PARAM"}, + "cudnnGetConvolutionMathType": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_BAD_PARAM"}, + "cudnnGetConvolutionNdDescriptor": {"arrayLengthRequested"}, + "cudnnGetConvolutionNdForwardOutputDim": {"convDesc", "inputTensorDesc", "filterDesc", "nbDims"}, + "cudnnGetConvolutionReorderType": {"convDesc"}, + "cudnnGetFoldedConvBackwardDataDescriptors": {"handle", "filterDesc", "diffDesc", "convDesc", "gradDesc", "transformFormat"}, + "cudnnIm2Col": {"handle", "srcDesc", "srcData", "filterDesc", "convDesc"}, + "cudnnReorderFilterAndBias": {"filterDesc", "reorderType", "filterData", "reorderedFilterData", "reorderBias", "biasData", "reorderedBiasData"}, + "cudnnSetConvolution2dDescriptor": {"pad_h", "pad_w", "u", "v", "dilation_h", "dilation_w", "mode", "computeType"}, + "cudnnSetConvolutionGroupCount": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_BAD_PARAM"}, + "cudnnSetConvolutionMathType": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_BAD_PARAM"}, + "cudnnSetConvolutionNdDescriptor": {"arrayLength", "padA", "filterStrideA", "dilationA", "mode", "datatype"}, + "cudnnSetConvolutionReorderType": {"convDesc", "reorderType"}, + "cudnnConvolutionBackwardBias": {"handle", "alpha", "beta", "dyDesc", "dy", "dbDesc"}, + "cudnnConvolutionBackwardFilter": {"handle", "alpha", "beta", "xDesc", "x", "dyDesc", "dy", "convDesc", "algo", "workSpace", "workSpaceSizeInBytes", "dwDesc"}, + "cudnnCreateFusedOpsConstParamPack": {"constPack", "ops"}, + "cudnnCreateFusedOpsPlan": {"plan", "ops"}, + "cudnnCreateFusedOpsVariantParamPack": {"varPack", "ops"}, + "cudnnDestroyFusedOpsConstParamPack": {"constPack"}, + "cudnnDestroyFusedOpsPlan": {"plan"}, + "cudnnDestroyFusedOpsVariantParamPack": {"varPack"}, + "cudnnFindConvolutionBackwardFilterAlgorithm": {"handle", "xDesc", "dyDesc", "convDesc", "dwDesc", "requestedAlgoCount"}, + "cudnnFindConvolutionBackwardFilterAlgorithmEx": {"handle", "xDesc", "x", "dyDesc", "dy", "convDesc", "dwDesc", "requestedAlgoCount", "workSpace", "workSpaceSizeInBytes"}, + "cudnnFusedOpsExecute": {"handle", "plan", "varPack"}, + "cudnnGetConvolutionBackwardFilterAlgorithmMaxCount": {"handle"}, + "cudnnGetConvolutionBackwardFilterAlgorithm_v7": {"handle", "xDesc", "dyDesc", "convDesc", "dwDesc", "requestedAlgoCount"}, + "cudnnGetConvolutionBackwardFilterWorkspaceSize": {"handle", "xDesc", "dyDesc", "convDesc", "dwDesc", "algo"}, + "cudnnGetFusedOpsConstParamPackAttribute": {"constPack", "paramLabel", "param"}, + "cudnnGetFusedOpsVariantParamPackAttribute": {"varPack", "paramLabel"}, + "cudnnMakeFusedOpsPlan": {"handle", "plan", "constPack"}, + "cudnnSetFusedOpsConstParamPackAttribute": {"constPack", "paramLabel", "param"}, + "cudnnSetFusedOpsVariantParamPackAttribute": {"varPack", "paramLabel", "ptr"}, + "cudnnAdvInferVersionCheck": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_VERSION_MISMATCH"}, + "cudnnBuildRNNDynamic": 
{"handle", "rnnDesc", "miniBatch"}, + "cudnnCreatePersistentRNNPlan": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_MAPPING_ERROR", "CUDNN_STATUS_ALLOC_FAILED", "CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING", "CUDNN_STATUS_NOT_SUPPORTED"}, + "cudnnCreateRNNDataDescriptor": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_BAD_PARAM", "CUDNN_STATUS_ALLOC_FAILED"}, + "cudnnCreateRNNDescriptor": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_ALLOC_FAILED"}, + "cudnnDestroyPersistentRNNPlan": {"CUDNN_STATUS_SUCCESS"}, + "cudnnDestroyRNNDataDescriptor": {"CUDNN_STATUS_SUCCESS"}, + "cudnnDestroyRNNDescriptor": {"CUDNN_STATUS_SUCCESS"}, + "cudnnFindRNNForwardInferenceAlgorithmEx": {"handle", "rnnDesc", "seqLength", "xDesc", "x", "hxDesc", "hx", "cxDesc", "cx", "wDesc", "w", "yDesc", "hyDesc", "cyDesc", "findIntensity", "requestedAlgoCount", "workspace", "workSpaceSizeInBytes"}, + "cudnnGetAttnDescriptor": {"attnDesc"}, + "cudnnGetMultiHeadAttnBuffers": {"handle", "attnDesc"}, + "cudnnGetMultiHeadAttnWeights": {"handle", "attnDesc", "wKind", "weightSizeInBytes", "weights"}, + "cudnnGetRNNBiasMode": {"rnnDesc"}, + "cudnnGetRNNDataDescriptor": {"RNNDataDesc", "arrayLengthRequested"}, + "cudnnGetRNNDescriptor_v6": {"handle", "rnnDesc"}, + "cudnnGetRNNDescriptor_v8": {"rnnDesc"}, + "cudnnGetRNNLinLayerBiasParams": {"handle", "rnnDesc", "pseudoLayer", "xDesc", "wDesc", "w", "linLayerID"}, + "cudnnGetRNNLinLayerMatrixParams": {"handle", "rnnDesc", "pseudoLayer", "xDesc", "wDesc", "w", "linLayerID"}, + "cudnnGetRNNMatrixMathType": {"rnnDesc"}, + "cudnnGetRNNPaddingMode": {"*paddingMode"}, + "cudnnGetRNNParamsSize": {"handle", "rnnDesc", "xDesc", "dataType"}, + "cudnnGetRNNProjectionLayers": {"handle", "rnnDesc"}, + "cudnnGetRNNTempSpaceSizes": {"handle", "rnnDesc", "fMode", "xDesc"}, + "cudnnGetRNNWeightParams": {"handle", "rnnDesc", "pseudoLayer", "weightSpaceSize", "weightSpace", "linLayerID"}, + "cudnnGetRNNWeightSpaceSize": {"handle", "rnnDesc"}, + "cudnnGetRNNWorkspaceSize": {"handle", "rnnDesc", "seqLength", "xDesc"}, + "cudnnMultiHeadAttnForward": {"handle", "attnDesc", "currIdx", "loWinIdx[]", "hiWinIdx[]", "devSeqLengthsQO[]", "devSeqLengthsKV[]", "qDesc", "queries", "residuals", "kDesc", "keys", "vDesc", "values", "oDesc", "weightSizeInBytes", "weights", "workSpaceSizeInBytes", "reserveSpaceSizeInBytes"}, + "cudnnRNNForward": {"handle", "rnnDesc", "fwdMode", "devSeqLengths", "xDesc", "x", "yDesc", "hDesc", "hx", "cDesc", "cx", "weightSpaceSize", "weightSpace", "workSpaceSize", "reserveSpaceSize"}, + "cudnnRNNForwardInference": {"handle", "rnnDesc", "seqLength", "xDesc", "x", "hxDesc", "hx", "cxDesc", "cx", "wDesc", "w", "yDesc", "hyDesc", "cyDesc", "workspace", "workSpaceSizeInBytes"}, + "cudnnRNNForwardInferenceEx": {"handle", "rnnDesc", "xDesc", "x", "hxDesc", "hx", "cxDesc", "cx", "wDesc", "w", "yDesc", "hyDesc", "cyDesc", "kDesc", "keys", "cDesc", "cAttn", "iDesc", "iAttn", "qDesc", "queries", "workspace", "workSpaceSizeInBytes"}, + "cudnnRNNGetClip_v8": {"rnnDesc"}, + "cudnnRNNSetClip": {"clipMode", "lclip", "rclip", "clipNanOpt"}, + "cudnnRNNSetClip_v8": {"rnnDesc", "clipMode", "clipNanOpt", "lclip", "rclip"}, + "cudnnSetAttnDescriptor": {"attnMode", "nHeads", "smScaler", "dataType", "computePrec", "mathType", "attnDropoutDesc", "postDropoutDesc", "qSize", "kSize", "vSize", "qProjSize", "kProjSize", "vProjSize", "oProjSize", "qoMaxSeqLength", "kvMaxSeqLength", "maxBatchSize", "maxBeamSize"}, + "cudnnSetPersistentRNNPlan": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_BAD_PARAM"}, + "cudnnSetRNNBiasMode": {"biasMode"}, + 
"cudnnSetRNNDataDescriptor": {"dataType", "layout", "maxSeqLength", "batchSize", "vectorSize", "seqLengthArray", "paddingFill"}, + "cudnnSetRNNDescriptor_v6": {"handle", "hiddenSize", "numLayers", "dropoutDesc", "inputMode", "direction", "mode", "algo", "mathPrec"}, + "cudnnSetRNNDescriptor_v8": {"rnnDesc", "algo", "cellMode", "biasMode", "dirMode", "inputMode", "dataType", "mathPrec", "mathType", "inputSize", "hiddenSize", "projSize", "numLayers", "dropoutDesc", "auxFlags"}, + "cudnnSetRNNMatrixMathType": {"rnnDesc", "mType"}, + "cudnnSetRNNPaddingMode": {"paddingMode"}, + "cudnnSetRNNProjectionLayers": {"handle", "rnnDesc", "recProjSize", "outProjSize"}, + "cudnnAdvTrainVersionCheck": {"CUDNN_STATUS_SUCCESS", "CUDNN_STATUS_VERSION_MISMATCH"}, + "cudnnCTCLoss": {"handle", "probsDesc", "probs", "hostLabels", "hostLabelLengths", "hostInputLengths", "gradientsDesc", "algo", "ctcLossDesc", "workspace", "sizeInBytes"}, + "cudnnCTCLoss_v8": {"handle", "algo", "ctcLossDesc", "probsDesc", "probs", "labels", "labelLengths", "inputLengths", "gradientsDesc", "workspace", "sizeInBytes"}, + "cudnnDestroyCTCLossDescriptor": {"ctcLossDesc"}, + "cudnnFindRNNBackwardDataAlgorithmEx": {"handle", "rnnDesc", "seqLength", "yDesc", "y", "dyDesc", "dy", "dhyDesc", "dhy", "dcyDesc", "dcy", "wDesc", "w", "hxDesc", "hx", "cxDesc", "cx", "dxDesc", "dhxDesc", "dcxDesc", "findIntensity", "requestedAlgoCount", "workspace", "workSpaceSizeInBytes", "reserveSpaceSizeInBytes"}, + "cudnnFindRNNBackwardWeightsAlgorithmEx": {"handle", "rnnDesc", "seqLength", "xDesc", "x", "hxDesc", "hx", "yDesc", "y", "findIntensity", "requestedAlgoCount", "workspace", "workSpaceSizeInBytes", "dwDesc", "reserveSpace", "reserveSpaceSizeInBytes"}, + "cudnnFindRNNForwardTrainingAlgorithmEx": {"handle", "rnnDesc", "xDesc", "seqLength", "x", "hxDesc", "hx", "cxDesc", "cx", "wDesc", "w", "yDesc", "hyDesc", "cyDesc", "findIntensity", "requestedAlgoCount", "workspace", "workSpaceSizeInBytes", "reserveSpaceSizeInBytes"}, + "cudnnGetCTCLossDescriptor": {"ctcLossDesc"}, + "cudnnGetCTCLossDescriptorEx": {"ctcLossDesc"}, + "cudnnGetCTCLossDescriptor_v8": {"ctcLossDesc"}, + "cudnnGetCTCLossWorkspaceSize": {"handle", "probsDesc", "gradientsDesc", "labels", "labelLengths", "inputLengths", "algo", "ctcLossDesc"}, + "cudnnGetCTCLossWorkspaceSize_v8": {"handle", "algo", "ctcLossDesc", "probsDesc", "gradientsDesc"}, + "cudnnGetRNNTrainingReserveSize": {"handle", "rnnDesc", "seqLength", "xDesc"}, + "cudnnMultiHeadAttnBackwardData": {"handle", "attnDesc", "loWinIdx[]", "hiWinIdx[]", "devSeqLengthsDQDO[]", "devSeqLengthsDKDV[]", "doDesc", "dout", "dqDesc", "queries", "dkDesc", "keys", "dvDesc", "values", "weightSizeInBytes", "weights", "workSpaceSizeInBytes", "reserveSpaceSizeInBytes"}, + "cudnnMultiHeadAttnBackwardWeights": {"handle", "attnDesc", "addGrad", "qDesc", "queries", "kDesc", "keys", "vDesc", "values", "doDesc", "dout", "weightSizeInBytes", "weights", "workSpaceSizeInBytes", "reserveSpaceSizeInBytes"}, + "cudnnRNNBackwardData": {"handle", "rnnDesc", "seqLength", "yDesc", "y", "dyDesc", "dy", "dhyDesc", "dhy", "dcyDesc", "dcy", "wDesc", "w", "hxDesc", "hx", "cxDesc", "cx", "dxDesc", "dhxDesc", "dcxDesc", "workspace", "workSpaceSizeInBytes", "reserveSpaceSizeInBytes"}, + "cudnnRNNBackwardData_v8": {"handle", "rnnDesc", "devSeqLengths", "yDesc", "y", "dy", "xDesc", "hDesc", "hx", "dhy", "cDesc", "cx", "dcy", "weightSpaceSize", "weightSpace", "workSpaceSize", "reserveSpaceSize"}, + "cudnnRNNBackwardDataEx": {"handle", "rnnDesc", "yDesc", "y", "dyDesc", 
"dy", "dhyDesc", "dhy", "dcyDesc", "dcy", "wDesc", "w", "hxDesc", "hx", "cxDesc", "cx", "dxDesc", "dhxDesc", "dcxDesc", "dkDesc", "dkeys", "workspace", "workSpaceSizeInBytes", "reserveSpaceSizeInBytes"}, + "cudnnRNNBackwardWeights": {"handle", "rnnDesc", "seqLength", "xDesc", "x", "hxDesc", "hx", "yDesc", "y", "workspace", "workSpaceSizeInBytes", "dwDesc", "reserveSpace", "reserveSpaceSizeInBytes"}, + "cudnnRNNBackwardWeights_v8": {"handle", "rnnDesc", "addGrad", "devSeqLengths", "xDesc", "x", "hDesc", "hx", "yDesc", "weightSpaceSize", "workSpaceSize", "reserveSpaceSize"}, + "cudnnRNNBackwardWeightsEx": {"handle", "rnnDesc", "xDesc", "x", "hxDesc", "hx", "yDesc", "y", "workspace", "workSpaceSizeInBytes", "dwDesc", "reserveSpace", "reserveSpaceSizeInBytes"}, + "cudnnRNNForwardTraining": {"handle", "rnnDesc", "seqLength", "xDesc", "x", "hxDesc", "hx", "cxDesc", "cx", "wDesc", "w", "yDesc", "hyDesc", "cyDesc", "workspace", "workSpaceSizeInBytes", "reserveSpaceSizeInBytes"}, + "cudnnRNNForwardTrainingEx": {"handle", "rnnDesc", "xDesc", "x", "hxDesc", "hx", "cxDesc", "cx", "wDesc", "w", "yDesc", "hyDesc", "cyDesc", "kDesc", "keys", "cDesc", "cAttn", "iDesc", "iAttn", "qDesc", "queries", "workspace", "workSpaceSizeInBytes", "reserveSpaceSizeInBytes"}, + "cudnnSetCTCLossDescriptor": {"compType"}, + "cudnnSetCTCLossDescriptorEx": {"compType", "normMode", "gradMode"}, + "cudnnSetCTCLossDescriptor_v8": {"compType", "normMode", "gradMode", "maxLabelLength"}, } var outputParams = map[string][]string{ - "cudnnActivationBackward": {"dx"}, - "cudnnActivationForward": {"y"}, - "cudnnBatchNormalizationBackward": {"resultBnScaleDiff", "resultBnBiasDiff"}, - "cudnnBatchNormalizationForwardTraining": {"resultSaveMean", "resultSaveInvVariance"}, - "cudnnCTCLoss": {"costs", "gradients"}, - "cudnnConvolutionBackwardBias": {"db"}, - "cudnnCreate": {"handle"}, - "cudnnCreateCTCLossDescriptor": {"ctcLossDesc"}, - "cudnnCreateOpTensorDescriptor": {"opTensorDesc"}, - "cudnnDeriveBNTensorDescriptor": {"derivedBnDesc"}, - "cudnnDivisiveNormalizationBackward": {"dx", "dMeans"}, - "cudnnDivisiveNormalizationForward": {"y"}, - "cudnnDropoutBackward": {"dx"}, - "cudnnDropoutForward": {"y", "reserveSpace"}, - "cudnnDropoutGetReserveSpaceSize": {"sizeInBytes"}, - "cudnnDropoutGetStatesSize": {"sizeInBytes"}, - "cudnnFindConvolutionBackwardDataAlgorithm": {"returnedAlgoCount", "perfResults"}, - "cudnnFindConvolutionBackwardDataAlgorithmEx": {"returnedAlgoCount", "perfResults"}, - "cudnnFindConvolutionBackwardFilterAlgorithm": {"returnedAlgoCount", "perfResults"}, - "cudnnFindConvolutionBackwardFilterAlgorithmEx": {"returnedAlgoCount", "perfResults"}, - "cudnnFindConvolutionForwardAlgorithm": {"returnedAlgoCount", "perfResults"}, - "cudnnFindConvolutionForwardAlgorithmEx": {"returnedAlgoCount", "perfResults"}, - "cudnnFindRNNBackwardDataAlgorithmEx": {"dx", "dhx", "dcx", "returnedAlgoCount", "perfResults"}, - "cudnnFindRNNBackwardWeightsAlgorithmEx": {"returnedAlgoCount", "perfResults"}, - "cudnnFindRNNForwardInferenceAlgorithmEx": {"y", "hy", "cy", "returnedAlgoCount", "perfResults"}, - "cudnnFindRNNForwardTrainingAlgorithmEx": {"y", "hy", "cy", "returnedAlgoCount", "perfResults"}, - "cudnnGetActivationDescriptor": {"mode", "reluNanOpt", "coef"}, - "cudnnGetAlgorithmPerformance": {"algoDesc", "status", "timecoef", "memory"}, - "cudnnGetCTCLossDescriptor": {"compType"}, - "cudnnGetCTCLossWorkspaceSize": {"sizeInBytes"}, - "cudnnGetCallback": {"mask", "udata", "fptr"}, - "cudnnGetConvolution2dDescriptor": {"pad_h", "pad_w", 
"u", "v", "dilation_h", "dilation_w", "mode", "computeType"}, - "cudnnGetConvolution2dForwardOutputDim": {"n", "c", "h", "w"}, - "cudnnGetConvolutionBackwardDataAlgorithm": {"algo"}, - "cudnnGetConvolutionBackwardDataAlgorithmMaxCount": {"count"}, - "cudnnGetConvolutionBackwardDataAlgorithm_v7": {"returnedAlgoCount", "perfResults"}, - "cudnnGetConvolutionBackwardDataWorkspaceSize": {"sizeInBytes"}, - "cudnnGetConvolutionBackwardFilterAlgorithm": {"algo"}, - "cudnnGetConvolutionBackwardFilterAlgorithmMaxCount": {"count"}, - "cudnnGetConvolutionBackwardFilterAlgorithm_v7": {"returnedAlgoCount", "perfResults"}, - "cudnnGetConvolutionBackwardFilterWorkspaceSize": {"sizeInBytes"}, - "cudnnGetConvolutionForwardAlgorithm": {"algo"}, - "cudnnGetConvolutionForwardAlgorithmMaxCount": {"count"}, - "cudnnGetConvolutionForwardAlgorithm_v7": {"returnedAlgoCount", "perfResults"}, - "cudnnGetConvolutionForwardWorkspaceSize": {"sizeInBytes"}, - "cudnnGetConvolutionNdDescriptor": {"arrayLength", "padA", "filterStrideA", "dilationA", "mode", "datatype"}, - "cudnnGetConvolutionNdForwardOutputDim": {"tensorOuputDimA"}, - "cudnnGetDropoutDescriptor": {"dropout", "states", "seed"}, - "cudnnGetFilter4dDescriptor": {"datatype", "format", "k", "c", "h", "w"}, - "cudnnGetFilterNdDescriptor": {"datatype", "format", "nbDims", "filterDimA"}, - "cudnnGetLRNDescriptor": {"normDesc", "lrnN", "lrnAlpha", "lrnBeta", "lrnK"}, - "cudnnGetOpTensorDescriptor": {"opTensorOp", "opTensorCompType", "opTensorNanOpt"}, - "cudnnGetPooling2dDescriptor": {"mode", "maxpoolingNanOpt", "windowHeight", "windowWidth", "verticalPadding", "horizontalPadding", "verticalStride", "horizontalStride"}, - "cudnnGetPooling2dForwardOutputDim": {"n", "c", "h", "w"}, // docs on the internet has capitalized retVals - "cudnnGetPoolingNdDescriptor": {"mode", "nbDims", "windowDimA", "paddingA", "strideA"}, - "cudnnGetPoolingNdForwardOutputDim": {"outDimA"}, - "cudnnGetProperty": {"value"}, - "cudnnGetRNNDataDescriptor": {"dataType", "layout", "maxSeqLength", "batchSize", "vectorSize", "seqLengthArray", "paddingFill"}, - "cudnnGetRNNDescriptor": {"hiddenSize", "numLayers", "dropoutDesc", "inputMode", "direction", "mode", "algo", "dataType"}, - "cudnnGetRNNLinLayerBiasParams": {"linLayerBiasDesc", "linLayerBias"}, - "cudnnGetRNNLinLayerMatrixParams": {"linLayerMatDesc", "linLayerMat"}, - "cudnnGetRNNParamsSize": {"sizeInBytes"}, - "cudnnGetRNNProjectionLayers": {"recProjSize", "outProjSize"}, - "cudnnGetRNNTrainingReserveSize": {"sizeInBytes"}, - "cudnnGetRNNWorkspaceSize": {"sizeInBytes"}, - "cudnnGetReduceTensorDescriptor": {"reduceTensorOp", "reduceTensorCompType", "reduceTensorIndices", "reduceTensorIndicesType"}, - "cudnnGetReductionIndicesSize": {"sizeInBytes"}, - "cudnnGetReductionWorkspaceSize": {"sizeInBytes"}, - "cudnnGetStream": {"streamID"}, - "cudnnGetTensor4dDescriptor": {"datatype", "n", "c", "h", "w", "nStride", "cStride", "hStride", "wStride"}, - "cudnnGetTensorNdDescriptor": {"datatype", "nbDims", "dimA"}, - "cudnnGetTensorSizeInBytes": {"size"}, - "cudnnIm2Col": {"colBuffer"}, - "cudnnLRNCrossChannelBackward": {"dxDesc", "dx"}, - "cudnnLRNCrossChannelForward": {"y"}, - "cudnnPoolingBackward": {"dx"}, - "cudnnPoolingForward": {"y"}, - "cudnnQueryRuntimeError": {"rstatus"}, - "cudnnRNNBackwardData": {"dx", "dhx", "dcx"}, - "cudnnRNNBackwardDataEx": {"dx", "dhx", "dcx"}, - "cudnnRNNForwardInference": {"y", "hy", "cy"}, - "cudnnRNNForwardInferenceEx": {"y", "hy", "cy"}, - "cudnnRNNForwardTraining": {"y", "hy", "cy"}, - 
"cudnnRNNForwardTrainingEx": {"y", "hy", "cy"}, - "cudnnRNNGetClip": {"*clipMode", "*lclip", "*rclip", "*clipNanOpt"}, - "cudnnReduceTensor": {"indices"}, - "cudnnSetCTCLossDescriptor": {"ctcLossDesc"}, - "cudnnSetDropoutDescriptor": {"states"}, - "cudnnSetLRNDescriptor": {"normDesc"}, - "cudnnSetOpTensorDescriptor": {"opTensorDesc"}, - "cudnnSetPoolingNdDescriptor": {"windowDimA", "paddingA", "strideA"}, - "cudnnSetTensorNdDescriptorEx": {"tensorDesc"}, - "cudnnSoftmaxBackward": {"dx"}, - "cudnnSoftmaxForward": {"y"}, - "cudnnSpatialTfGridGeneratorBackward": {"dtheta"}, - "cudnnSpatialTfGridGeneratorForward": {"grid"}, - "cudnnSpatialTfSamplerBackward": {"dx", "dgrid"}, - "cudnnSpatialTfSamplerForward": {"y"}, - "cudnnTransformTensor": {"y"}, + "cudnnActivationForward": {"y"}, + "cudnnCreate": {"handle"}, + "cudnnCreateOpTensorDescriptor": {"opTensorDesc"}, + "cudnnCreateTensorTransformDescriptor": {"transformDesc"}, + "cudnnDeriveBNTensorDescriptor": {"derivedBnDesc"}, + "cudnnDeriveNormTensorDescriptor": {"derivedNormScaleBiasDesc", "derivedNormMeanVarDesc"}, + "cudnnDivisiveNormalizationForward": {"y"}, + "cudnnDropoutForward": {"y", "reserveSpace"}, + "cudnnDropoutGetReserveSpaceSize": {"sizeInBytes"}, + "cudnnDropoutGetStatesSize": {"sizeInBytes"}, + "cudnnGetActivationDescriptor": {"mode", "reluNanOpt", "coef"}, + "cudnnGetAlgorithmPerformance": {"algoDesc", "status", "timecoef", "memory"}, + "cudnnGetAlgorithmSpaceSize": {"algoSpaceSizeInBytes"}, + "cudnnGetCallback": {"mask", "udata", "fptr"}, + "cudnnGetDropoutDescriptor": {"dropout", "states", "seed"}, + "cudnnGetFilter4dDescriptor": {"datatype", "format", "k", "c", "h", "w"}, + "cudnnGetFilterNdDescriptor": {"datatype", "format", "nbDims", "filterDimA"}, + "cudnnGetFilterSizeInBytes": {"size"}, + "cudnnGetLRNDescriptor": {"normDesc", "lrnN", "lrnAlpha", "lrnBeta", "lrnK"}, + "cudnnGetOpTensorDescriptor": {"opTensorOp", "opTensorCompType", "opTensorNanOpt"}, + "cudnnGetPooling2dDescriptor": {"mode", "maxpoolingNanOpt", "windowHeight", "windowWidth", "verticalPadding", "horizontalPadding", "verticalStride", "horizontalStride"}, + "cudnnGetPooling2dForwardOutputDim": {"N", "C", "H", "W"}, + "cudnnGetPoolingNdDescriptor": {"mode", "nbDims", "windowDimA", "paddingA", "strideA"}, + "cudnnGetPoolingNdForwardOutputDim": {"outDimA"}, + "cudnnGetProperty": {"value"}, + "cudnnGetReduceTensorDescriptor": {"reduceTensorOp", "reduceTensorCompType", "reduceTensorIndices", "reduceTensorIndicesType"}, + "cudnnGetReductionIndicesSize": {"sizeInBytes"}, + "cudnnGetReductionWorkspaceSize": {"sizeInBytes"}, + "cudnnGetStream": {"streamID"}, + "cudnnGetTensor4dDescriptor": {"datatype", "n", "c", "h", "w", "nStride", "cStride", "hStride", "wStride"}, + "cudnnGetTensorNdDescriptor": {"datatype", "nbDims", "dimA", "strideA"}, + "cudnnGetTensorSizeInBytes": {"size"}, + "cudnnGetTensorTransformDescriptor": {"destFormat", "padBeforeA[]", "padAfterA[]", "foldA[]", "direction"}, + "cudnnInitTransformDest": {"destDesc", "destSizeInBytes"}, + "cudnnLRNCrossChannelForward": {"y"}, + "cudnnNormalizationForwardInference": {"*y"}, + "cudnnPoolingForward": {"y"}, + "cudnnQueryRuntimeError": {"rstatus"}, + "cudnnReduceTensor": {"indices"}, + "cudnnSetDropoutDescriptor": {"states"}, + "cudnnSetLRNDescriptor": {"normDesc"}, + "cudnnSetOpTensorDescriptor": {"opTensorDesc"}, + "cudnnSetTensorNdDescriptorEx": {"tensorDesc"}, + "cudnnSetTensorTransformDescriptor": {"transformDesc"}, + "cudnnSoftmaxForward": {"y"}, + "cudnnSpatialTfGridGeneratorForward": {"grid"}, + 
"cudnnSpatialTfSamplerForward": {"y"}, + "cudnnTransformTensor": {"y"}, + "cudnnActivationBackward": {"dx"}, + "cudnnBatchNormalizationBackward": {"resultBnScaleDiff", "resultBnBiasDiff"}, + "cudnnBatchNormalizationBackwardEx": {"dzDesc", "*dzData", "dxDesc", "*dxData"}, + "cudnnBatchNormalizationForwardTraining": {"resultSaveMean", "resultSaveInvVariance"}, + "cudnnBatchNormalizationForwardTrainingEx": {"*saveMean", "*saveInvVariance"}, + "cudnnDivisiveNormalizationBackward": {"dx", "dMeans"}, + "cudnnDropoutBackward": {"dx"}, + "cudnnGetBatchNormalizationBackwardExWorkspaceSize": {"*sizeInBytes"}, + "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize": {"*sizeInBytes"}, + "cudnnGetBatchNormalizationTrainingExReserveSpaceSize": {"*sizeInBytes"}, + "cudnnGetNormalizationBackwardWorkspaceSize": {"*sizeInBytes"}, + "cudnnGetNormalizationForwardTrainingWorkspaceSize": {"*sizeInBytes"}, + "cudnnGetNormalizationTrainingReserveSpaceSize": {"*sizeInBytes"}, + "cudnnLRNCrossChannelBackward": {"dxDesc", "dx"}, + "cudnnNormalizationBackward": {"dzDesc", "*dzData", "dxDesc", "*dxData"}, + "cudnnNormalizationForwardTraining": {"*yData", "*resultSaveMean", "*resultSaveInvVariance"}, + "cudnnPoolingBackward": {"dx"}, + "cudnnSoftmaxBackward": {"dx"}, + "cudnnSpatialTfGridGeneratorBackward": {"dtheta"}, + "cudnnSpatialTfSamplerBackward": {"dx", "dgrid"}, + "cudnnFindConvolutionBackwardDataAlgorithm": {"returnedAlgoCount", "perfResults"}, + "cudnnFindConvolutionBackwardDataAlgorithmEx": {"returnedAlgoCount", "perfResults"}, + "cudnnFindConvolutionForwardAlgorithm": {"returnedAlgoCount", "perfResults"}, + "cudnnFindConvolutionForwardAlgorithmEx": {"returnedAlgoCount", "perfResults"}, + "cudnnGetConvolution2dDescriptor": {"pad_h", "pad_w", "u", "v", "dilation_h", "dilation_w", "mode", "computeType"}, + "cudnnGetConvolution2dForwardOutputDim": {"n", "c", "h", "w"}, + "cudnnGetConvolutionBackwardDataAlgorithmMaxCount": {"count"}, + "cudnnGetConvolutionBackwardDataAlgorithm_v7": {"returnedAlgoCount", "perfResults"}, + "cudnnGetConvolutionBackwardDataWorkspaceSize": {"sizeInBytes"}, + "cudnnGetConvolutionForwardAlgorithmMaxCount": {"count"}, + "cudnnGetConvolutionForwardAlgorithm_v7": {"returnedAlgoCount", "perfResults"}, + "cudnnGetConvolutionForwardWorkspaceSize": {"sizeInBytes"}, + "cudnnGetConvolutionNdDescriptor": {"arrayLength", "padA", "filterStrideA", "dilationA", "mode", "datatype"}, + "cudnnGetConvolutionNdForwardOutputDim": {"tensorOuputDimA"}, + "cudnnGetConvolutionReorderType": {"reorderType"}, + "cudnnGetFoldedConvBackwardDataDescriptors": {"foldedFilterDesc", "paddedDiffDesc", "foldedConvDesc", "foldedGradDesc", "filterFoldTransDesc", "diffPadTransDesc", "gradFoldTransDesc", "gradUnfoldTransDesc"}, + "cudnnIm2Col": {"colBuffer"}, + "cudnnConvolutionBackwardBias": {"db"}, + "cudnnFindConvolutionBackwardFilterAlgorithm": {"returnedAlgoCount", "perfResults"}, + "cudnnFindConvolutionBackwardFilterAlgorithmEx": {"returnedAlgoCount", "perfResults"}, + "cudnnGetConvolutionBackwardFilterAlgorithmMaxCount": {"count"}, + "cudnnGetConvolutionBackwardFilterAlgorithm_v7": {"returnedAlgoCount", "perfResults"}, + "cudnnGetConvolutionBackwardFilterWorkspaceSize": {"sizeInBytes"}, + "cudnnGetFusedOpsVariantParamPackAttribute": {"ptr"}, + "cudnnMakeFusedOpsPlan": {"workspaceSizeInBytes"}, + "cudnnFindRNNForwardInferenceAlgorithmEx": {"y", "hy", "cy", "returnedAlgoCount", "perfResults"}, + "cudnnGetAttnDescriptor": {"attnMode", "nHeads", "smScaler", "dataType", "computePrec", "mathType", "attnDropoutDesc", 
"postDropoutDesc", "qSize", "kSize", "vSize", "qProjSize", "kProjSize", "vProjSize", "oProjSize", "qoMaxSeqLength", "kvMaxSeqLength", "maxBatchSize", "maxBeamSize"}, + "cudnnGetMultiHeadAttnBuffers": {"weightSizeInBytes", "workSpaceSizeInBytes", "reserveSpaceSizeInBytes"}, + "cudnnGetMultiHeadAttnWeights": {"wDesc", "wAddr"}, + "cudnnGetRNNBiasMode": {"*biasMode"}, + "cudnnGetRNNDataDescriptor": {"dataType", "layout", "maxSeqLength", "batchSize", "vectorSize", "seqLengthArray", "paddingFill"}, + "cudnnGetRNNDescriptor_v6": {"hiddenSize", "numLayers", "dropoutDesc", "inputMode", "direction", "mode", "algo", "mathPrec"}, + "cudnnGetRNNDescriptor_v8": {"algo", "cellMode", "biasMode", "dirMode", "inputMode", "dataType", "mathPrec", "mathType", "inputSize", "hiddenSize", "projSize", "numLayers", "dropoutDesc", "auxFlags"}, + "cudnnGetRNNLinLayerBiasParams": {"linLayerBiasDesc", "linLayerBias"}, + "cudnnGetRNNLinLayerMatrixParams": {"linLayerMatDesc", "linLayerMat"}, + "cudnnGetRNNMatrixMathType": {"mType"}, + "cudnnGetRNNParamsSize": {"sizeInBytes"}, + "cudnnGetRNNProjectionLayers": {"recProjSize", "outProjSize"}, + "cudnnGetRNNTempSpaceSizes": {"workSpaceSize", "reserveSpaceSize"}, + "cudnnGetRNNWeightParams": {"mDesc", "mAddr", "bDesc", "bAddr"}, + "cudnnGetRNNWeightSpaceSize": {"weightSpaceSize"}, + "cudnnGetRNNWorkspaceSize": {"sizeInBytes"}, + "cudnnMultiHeadAttnForward": {"out"}, + "cudnnRNNForward": {"y", "hy", "cy"}, + "cudnnRNNForwardInference": {"y", "hy", "cy"}, + "cudnnRNNForwardInferenceEx": {"y", "hy", "cy"}, + "cudnnRNNGetClip": {"*clipMode", "*lclip", "*rclip", "*clipNanOpt"}, + "cudnnRNNGetClip_v8": {"clipMode", "clipNanOpt", "lclip", "rclip"}, + "cudnnSetAttnDescriptor": {"attnDesc"}, + "cudnnCreateCTCLossDescriptor": {"ctcLossDesc"}, + "cudnnCTCLoss": {"costs", "gradients"}, + "cudnnCTCLoss_v8": {"costs", "gradients"}, + "cudnnFindRNNBackwardDataAlgorithmEx": {"dx", "dhx", "dcx", "returnedAlgoCount", "perfResults"}, + "cudnnFindRNNBackwardWeightsAlgorithmEx": {"returnedAlgoCount", "perfResults"}, + "cudnnFindRNNForwardTrainingAlgorithmEx": {"y", "hy", "cy", "returnedAlgoCount", "perfResults"}, + "cudnnGetCTCLossDescriptor": {"compType"}, + "cudnnGetCTCLossDescriptorEx": {"compType", "normMode", "gradMode"}, + "cudnnGetCTCLossDescriptor_v8": {"compType", "normMode", "gradMode", "maxLabelLength"}, + "cudnnGetCTCLossWorkspaceSize": {"sizeInBytes"}, + "cudnnGetCTCLossWorkspaceSize_v8": {"sizeInBytes"}, + "cudnnGetRNNTrainingReserveSize": {"sizeInBytes"}, + "cudnnMultiHeadAttnBackwardData": {"dqueries", "dkeys", "dvalues"}, + "cudnnMultiHeadAttnBackwardWeights": {"dweights"}, + "cudnnRNNBackwardData": {"dx", "dhx", "dcx"}, + "cudnnRNNBackwardData_v8": {"dx", "dhx", "dcx"}, + "cudnnRNNBackwardDataEx": {"dx", "dhx", "dcx"}, + "cudnnRNNBackwardWeights_v8": {"y", "dweightSpace"}, + "cudnnRNNForwardTraining": {"y", "hy", "cy"}, + "cudnnRNNForwardTrainingEx": {"y", "hy", "cy"}, + "cudnnSetCTCLossDescriptor": {"ctcLossDesc"}, + "cudnnSetCTCLossDescriptorEx": {"ctcLossDesc"}, + "cudnnSetCTCLossDescriptor_v8": {"ctcLossDesc"}, } var ioParams = map[string][]string{ "cudnnAddTensor": {"C"}, - "cudnnConvolutionBackwardData": {"dx"}, - "cudnnConvolutionBackwardFilter": {"dw"}, - "cudnnConvolutionBiasActivationForward": {"y"}, - "cudnnConvolutionForward": {"y"}, - "cudnnFindConvolutionBackwardDataAlgorithmEx": {"dxDesc"}, - "cudnnFindConvolutionBackwardFilterAlgorithmEx": {"dw"}, - "cudnnFindConvolutionForwardAlgorithmEx": {"y"}, - "cudnnFindRNNBackwardDataAlgorithmEx": {"reserveSpace"}, - 
"cudnnFindRNNBackwardWeightsAlgorithmEx": {"dw"}, - "cudnnFindRNNForwardTrainingAlgorithmEx": {"reserveSpace"}, "cudnnGetAlgorithmPerformance": {"algoPerf"}, - "cudnnGetConvolution2dDescriptor": {"convDesc"}, - "cudnnGetConvolutionNdDescriptor": {"convDesc"}, - "cudnnGetRNNPaddingMode": {"rnnDesc"}, "cudnnOpTensor": {"C"}, "cudnnQueryRuntimeError": {"tag"}, - "cudnnRNNBackwardData": {"reserveSpace"}, - "cudnnRNNBackwardDataEx": {"reserveSpace"}, - "cudnnRNNBackwardWeights": {"dw"}, - "cudnnRNNBackwardWeightsEx": {"dw"}, - "cudnnRNNForwardTraining": {"reserveSpace"}, - "cudnnRNNForwardTrainingEx": {"reserveSpace"}, "cudnnReduceTensor": {"C"}, "cudnnRestoreDropoutDescriptor": {"dropoutDesc"}, "cudnnScaleTensor": {"y"}, "cudnnSetActivationDescriptor": {"activationDesc"}, "cudnnSetAlgorithmDescriptor": {"algorithmDesc"}, "cudnnSetAlgorithmPerformance": {"algoPerf"}, - "cudnnSetConvolution2dDescriptor": {"convDesc"}, - "cudnnSetConvolutionNdDescriptor": {"convDesc"}, "cudnnSetDropoutDescriptor": {"dropoutDesc"}, "cudnnSetFilter4dDescriptor": {"filterDesc"}, "cudnnSetFilterNdDescriptor": {"filterDesc"}, "cudnnSetPooling2dDescriptor": {"poolingDesc"}, "cudnnSetPoolingNdDescriptor": {"poolingDesc"}, - "cudnnSetRNNDataDescriptor": {"RNNDataDesc"}, - "cudnnSetRNNDescriptor": {"rnnDesc"}, - "cudnnSetRNNDescriptor_v5": {"rnnDesc"}, - "cudnnSetRNNDescriptor_v6": {"rnnDesc"}, - "cudnnSetRNNPaddingMode": {"rnnDesc"}, "cudnnSetReduceTensorDescriptor": {"reduceTensorDesc"}, "cudnnSetSpatialTransformerNdDescriptor": {"stDesc"}, "cudnnSetTensor": {"y"}, "cudnnSetTensor4dDescriptor": {"tensorDesc"}, "cudnnSetTensor4dDescriptorEx": {"tensorDesc"}, "cudnnSetTensorNdDescriptor": {"tensorDesc"}, + "cudnnConvolutionBackwardData": {"dx"}, + "cudnnConvolutionBiasActivationForward": {"y"}, + "cudnnConvolutionForward": {"y"}, + "cudnnFindConvolutionBackwardDataAlgorithmEx": {"dxDesc"}, + "cudnnFindConvolutionForwardAlgorithmEx": {"y"}, + "cudnnGetConvolution2dDescriptor": {"convDesc"}, + "cudnnGetConvolutionNdDescriptor": {"convDesc"}, + "cudnnSetConvolution2dDescriptor": {"convDesc"}, + "cudnnSetConvolutionNdDescriptor": {"convDesc"}, + "cudnnConvolutionBackwardFilter": {"dw"}, + "cudnnFindConvolutionBackwardFilterAlgorithmEx": {"dw"}, + "cudnnGetFusedOpsConstParamPackAttribute": {"isNULL"}, + "cudnnGetRNNPaddingMode": {"rnnDesc"}, + "cudnnMultiHeadAttnForward": {"workSpace", "reserveSpace"}, + "cudnnRNNForward": {"workSpace", "reserveSpace"}, + "cudnnSetRNNDataDescriptor": {"RNNDataDesc"}, + "cudnnSetRNNDescriptor_v6": {"rnnDesc"}, + "cudnnSetRNNPaddingMode": {"rnnDesc"}, + "cudnnFindRNNBackwardDataAlgorithmEx": {"reserveSpace"}, + "cudnnFindRNNBackwardWeightsAlgorithmEx": {"dw"}, + "cudnnFindRNNForwardTrainingAlgorithmEx": {"reserveSpace"}, + "cudnnMultiHeadAttnBackwardData": {"workSpace", "reserveSpace"}, + "cudnnMultiHeadAttnBackwardWeights": {"workSpace", "reserveSpace"}, + "cudnnRNNBackwardData": {"reserveSpace"}, + "cudnnRNNBackwardData_v8": {"workSpace", "reserveSpace"}, + "cudnnRNNBackwardDataEx": {"reserveSpace"}, + "cudnnRNNBackwardWeights": {"dw"}, + "cudnnRNNBackwardWeights_v8": {"workSpace", "reserveSpace"}, + "cudnnRNNBackwardWeightsEx": {"dw"}, + "cudnnRNNForwardTraining": {"reserveSpace"}, + "cudnnRNNForwardTrainingEx": {"reserveSpace"}, } var docs = map[string]string{ - "cudnnActivationBackward": "cudnnActivationBackward computes the gradient of a neuron activation function.", - "cudnnActivationForward": "cudnnActivationForward applies a specified neuron activation function element-wise over each 
input value.", - "cudnnAddTensor": "cudnnAddTensor adds the scaled values of a bias tensor to another tensor. Each dimension of the bias tensor A must match the corresponding dimension of the destination tensor C or must be equal to 1. In the latter case, the same value from the bias tensor for those dimensions will be used to blend into the C tensor.", - "cudnnBatchNormalizationBackward": "cudnnBatchNormalizationBackward performs the backward BatchNormalization layer computation.", - "cudnnBatchNormalizationForwardInference": "cudnnBatchNormalizationForwardInference performs the forward BatchNormalization layer computation for inference phase. cudnnBatchNormalizationForwardInference layer is based on the paper `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift`, S. Ioffe, C. Szegedy, 2015.", - "cudnnBatchNormalizationForwardTraining": "cudnnBatchNormalizationForwardTraining performs the forward BatchNormalization layer computation for training phase.", - "cudnnCTCLoss": "cudnnCTCLoss returns the ctc costs and gradients, given the probabilities and labels.", - "cudnnConvolutionBackwardBias": "cudnnConvolutionBackwardBias computes the convolution function gradient with respect to the bias, which is the sum of every element belonging to the same feature map across all of the images of the input tensor. Therefore, the number of elements produced is equal to the number of features maps of the input tensor.", - "cudnnConvolutionBackwardData": "cudnnConvolutionBackwardData computes the convolution gradient with respect to the output tensor using the specified algo, returning results in gradDesc. Scaling factors alpha and beta can be used to scale the input tensor and the output tensor respectively.", - "cudnnConvolutionBackwardFilter": "cudnnConvolutionBackwardFilter computes the convolution gradient with respect to filter coefficients using the specified algo, returning results in gradDesc.Scaling factors alpha and beta can be used to scale the input tensor and the output tensor respectively.", - "cudnnConvolutionBiasActivationForward": "cudnnConvolutionBiasActivationForward applies a bias and then an activation to the convolutions or cross-correlations of cudnnConvolutionForward(), returning results in y. The full computation follows the equation y = act ( alpha1 * conv(x) + alpha2 * z + bias ).", - "cudnnConvolutionForward": "cudnnConvolutionForward executes convolutions or cross-correlations over x using filters specified with w, returning results in y. Scaling factors alpha and beta can be used to scale the input tensor and the output tensor respectively.", - "cudnnCreate": "cudnnCreate initializes the cuDNN library and creates a handle to an opaque structure holding the cuDNN library context. It allocates hardware resources on the host and device and must be called prior to making any other cuDNN library calls. The cuDNN library handle is tied to the current CUDA device (context). To use the library on multiple devices, one cuDNN handle needs to be created for each device. For a given device, multiple cuDNN handles with different configurations (e.g., different current CUDA streams) may be created. Because cudnnCreate allocates some internal resources, the release of those resources by calling cudnnDestroy will implicitly call cudaDeviceSynchronize; therefore, the recommended best practice is to call cudnnCreate/cudnnDestroy outside of performance-critical code paths. 
For multithreaded applications that use the same device from different threads, the recommended programming model is to create one (or a few, as is convenient) cuDNN handle(s) per thread and use that cuDNN handle for the entire life of the thread.", - "cudnnCreateCTCLossDescriptor": "cudnnCreateCTCLossDescriptor creates a CTC loss function descriptor. .", - "cudnnCreateOpTensorDescriptor": "cudnnCreateOpTensorDescriptor creates a Tensor Pointwise math descriptor.", - "cudnnCreateTensorDescriptor": "cudnnCreateTensorDescriptor creates a generic tensor descriptor object by allocating the memory needed to hold its opaque structure. The data is initialized to be all zero.", - "cudnnDeriveBNTensorDescriptor": "Derives a secondary tensor descriptor for BatchNormalization scale, invVariance, bnBias, bnScale subtensors from the layer's x data descriptor. Use the tensor descriptor produced by this function as the bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc parameters in Spatial and Per-Activation Batch Normalization forward and backward functions. Resulting dimensions will be 1xC(x1)x1x1 for BATCHNORM_MODE_SPATIAL and 1xC(xD)xHxW for BATCHNORM_MODE_PER_ACTIVATION (parentheses for 5D). For HALF input data type the resulting tensor descriptor will have a FLOAT type. For other data types it will have the same type as the input data.", - "cudnnDestroy": "cudnnDestroy releases resources used by the cuDNN handle. cudnnDestroy is usually the last call with a particular handle to the cuDNN handle. Because cudnnCreate allocates some internal resources, the release of those resources by calling cudnnDestroy will implicitly call cudaDeviceSynchronize; therefore, the recommended best practice is to call cudnnCreate/cudnnDestroy outside of performance-critical code paths.", - "cudnnDestroyCTCLossDescriptor": "cudnnDestroyCTCLossDescriptor destroys a CTC loss function descriptor object.", - "cudnnDestroyOpTensorDescriptor": "cudnnDestroyOpTensorDescriptor deletes a Tensor Pointwise math descriptor object.", - "cudnnDestroyReduceTensorDescriptor": "cudnnDestroyReduceTensorDescriptor destroys a previously created reduce tensor descriptor object. When the input pointer is NULL, this function performs no destroy operation.", - "cudnnDestroyTensorDescriptor": "cudnnDestroyTensorDescriptor destroys a previously created tensor descriptor object. When the input pointer is NULL, this function performs no destroy operation.", - "cudnnDivisiveNormalizationBackward": "cudnnDivisiveNormalizationBackward performs the backward DivisiveNormalization layer computation.", - "cudnnDivisiveNormalizationForward": "cudnnDivisiveNormalizationForward performs the forward spatial DivisiveNormalization layer computation. It divides every value in a layer by the standard deviation of it's spatial neighbors as described in `What is the Best Multi-Stage Architecture for Object Recognition`, Jarrett 2009, Local Contrast Normalization Layer section. Note that Divisive Normalization only implements the x/max(c, sigma_x) portion of the computation, where sigma_x is the variance over the spatial neighborhood of x. The full LCN (Local Contrastive Normalization) computation can be implemented as a two-step process:", - "cudnnDropoutBackward": "cudnnDropoutBackward performs backward dropout operation over dy returning results in dx. 
If during forward dropout operation value from x was propagated to y then during backward operation value from dy will be propagated to dx, otherwise, dx value will be set to 0.", - "cudnnDropoutForward": "cudnnDropoutForward performs forward dropout operation over x returning results in y. If dropout was used as a parameter to cudnnSetDropoutDescriptor, the approximately dropout fraction of x values will be replaces by 0, and the rest will be scaled by 1/(1-dropout) cudnnDropoutForward should not be running concurrently with another cudnnDropoutForward function using the same states.", - "cudnnDropoutGetReserveSpaceSize": "cudnnDropoutGetReserveSpaceSize is used to query the amount of reserve needed to run dropout with the input dimensions given by xDesc. The same reserve space is expected to be passed to cudnnDropoutForward and cudnnDropoutBackward, and its contents is expected to remain unchanged between cudnnDropoutForward and cudnnDropoutBackward calls.", - "cudnnDropoutGetStatesSize": "cudnnDropoutGetStatesSize is used to query the amount of space required to store the states of the random number generators used by cudnnDropoutForward function.", - "cudnnFindConvolutionBackwardDataAlgorithm": "cudnnFindConvolutionBackwardDataAlgorithm attempts all cuDNN algorithms (including CUDNN_TENSOR_OP_MATH and CUDNN_DEFAULT_MATH versions of algorithms where CUDNN_TENSOR_OP_MATH may be available) for cudnnConvolutionBackwardData(), using memory allocated via cudaMalloc() and outputs performance metrics to a user-allocated array of cudnnConvolutionBwdDataAlgoPerf_t. These metrics are written in sorted fashion where the first element has the lowest compute time. The total number of resulting algorithms can be queried through the API cudnnGetConvolutionBackwardMaxCount().", - "cudnnFindConvolutionBackwardDataAlgorithmEx": "cudnnFindConvolutionBackwardDataAlgorithmEx attempts all cuDNN algorithms (including CUDNN_TENSOR_OP_MATH and CUDNN_DEFAULT_MATH versions of algorithms where CUDNN_TENSOR_OP_MATH may be available) for cudnnConvolutionBackwardData, using user-allocated GPU memory, and outputs performance metrics to a user-allocated array of cudnnConvolutionBwdDataAlgoPerf_t. These metrics are written in sorted fashion where the first element has the lowest compute time. The total number of resulting algorithms can be queried through the API cudnnGetConvolutionBackwardMaxCount().", - "cudnnFindConvolutionBackwardFilterAlgorithm": "cudnnFindConvolutionBackwardFilterAlgorithm attempts all cuDNN algorithms (including CUDNN_TENSOR_OP_MATH and CUDNN_DEFAULT_MATH versions of algorithms where CUDNN_TENSOR_OP_MATH may be available) for cudnnConvolutionBackwardFilter(), using GPU memory allocated via cudaMalloc(), and outputs performance metrics to a user-allocated array of cudnnConvolutionBwdFilterAlgoPerf_t. These metrics are written in sorted fashion where the first element has the lowest compute time. The total number of resulting algorithms can be queried through the API cudnnGetConvolutionBackwardMaxCount().", - "cudnnFindConvolutionBackwardFilterAlgorithmEx": "cudnnFindConvolutionBackwardFilterAlgorithmEx attempts all cuDNN algorithms (including CUDNN_TENSOR_OP_MATH and CUDNN_DEFAULT_MATH versions of algorithms where CUDNN_TENSOR_OP_MATH may be available) for cudnnConvolutionBackwardFilter, using user-allocated GPU memory, and outputs performance metrics to a user-allocated array of cudnnConvolutionBwdFilterAlgoPerf_t. 
These metrics are written in sorted fashion where the first element has the lowest compute time. The total number of resulting algorithms can be queried through the API cudnnGetConvolutionBackwardMaxCount().", - "cudnnFindConvolutionForwardAlgorithm": "cudnnFindConvolutionForwardAlgorithm attempts all cuDNN algorithms (including CUDNN_TENSOR_OP_MATH and CUDNN_DEFAULT_MATH versions of algorithms where CUDNN_TENSOR_OP_MATH may be available) for cudnnConvolutionForward(), using memory allocated via cudaMalloc(), and outputs performance metrics to a user-allocated array of cudnnConvolutionFwdAlgoPerf_t. These metrics are written in sorted fashion where the first element has the lowest compute time. The total number of resulting algorithms can be queried through the API cudnnGetConvolutionForwardMaxCount().", - "cudnnFindConvolutionForwardAlgorithmEx": "cudnnFindConvolutionForwardAlgorithmEx attempts all available cuDNN algorithms (including CUDNN_TENSOR_OP_MATH and CUDNN_DEFAULT_MATH versions of algorithms where CUDNN_TENSOR_OP_MATH may be available) for cudnnConvolutionForward, using user-allocated GPU memory, and outputs performance metrics to a user-allocated array of cudnnConvolutionFwdAlgoPerf_t. These metrics are written in sorted fashion where the first element has the lowest compute time. The total number of resulting algorithms can be queried through the API cudnnGetConvolutionForwardMaxCount().", - "cudnnFindRNNBackwardDataAlgorithmEx": "(New for 7.1)", - "cudnnFindRNNBackwardWeightsAlgorithmEx": "(New for 7.1)", - "cudnnFindRNNForwardInferenceAlgorithmEx": "(New for 7.1)", - "cudnnFindRNNForwardTrainingAlgorithmEx": "(New for 7.1)", - "cudnnGetActivationDescriptor": "cudnnGetActivationDescriptor queries a previously initialized generic activation descriptor object.", - "cudnnGetAlgorithmDescriptor": "(New for 7.1)", - "cudnnGetAlgorithmPerformance": "(New for 7.1)", - "cudnnGetAlgorithmSpaceSize": "(New for 7.1)", - "cudnnGetCTCLossDescriptor": "cudnnGetCTCLossDescriptor returns configuration of the passed CTC loss function descriptor.", - "cudnnGetCTCLossWorkspaceSize": "cudnnGetCTCLossWorkspaceSize returns the amount of GPU memory workspace the user needs to allocate to be able to call cudnnCTCLoss with the specified algorithm. The workspace allocated will then be passed to the routine cudnnCTCLoss.", - "cudnnGetCallback": "(New for 7.1)", - "cudnnGetConvolution2dDescriptor": "cudnnGetConvolution2dDescriptor queries a previously initialized 2D convolution descriptor object.", - "cudnnGetConvolution2dForwardOutputDim": "cudnnGetConvolution2dForwardOutputDim returns the dimensions of the resulting 4D tensor of a 2D convolution, given the convolution descriptor, the input tensor descriptor and the filter descriptor cudnnGetConvolution2dForwardOutputDim can help to setup the output tensor and allocate the proper amount of memory prior to launch the actual convolution.", - "cudnnGetConvolutionBackwardDataAlgorithm": "cudnnGetConvolutionBackwardDataAlgorithm serves as a heuristic for obtaining the best suited algorithm for cudnnConvolutionBackwardData for the given layer specifications. Based on the input preference, this function will either return the fastest algorithm or the fastest algorithm within a given memory limit. 
For an exhaustive search for the fastest algorithm, please use cudnnFindConvolutionBackwardDataAlgorithm.", - "cudnnGetConvolutionBackwardDataAlgorithmMaxCount": "cudnnGetConvolutionBackwardDataAlgorithmMaxCount returns the maximum number of algorithms which can be returned from cudnnFindConvolutionBackwardDataAlgorithm() and cudnnGetConvolutionForwardAlgorithm_v7(). cudnnGetConvolutionBackwardDataAlgorithmMaxCount is the sum of all algorithms plus the sum of all algorithms with Tensor Core operations supported for the current device.", - "cudnnGetConvolutionBackwardDataAlgorithm_v7": "cudnnGetConvolutionBackwardDataAlgorithm_v7 serves as a heuristic for obtaining the best suited algorithm for cudnnConvolutionBackwardData for the given layer specifications. cudnnGetConvolutionBackwardDataAlgorithm_v7 will return all algorithms (including CUDNN_TENSOR_OP_MATH and CUDNN_DEFAULT_MATH versions of algorithms where CUDNN_TENSOR_OP_MATH may be available) sorted by expected (based on internal heuristic) relative performance with fastest being index 0 of perfResults. For an exhaustive search for the fastest algorithm, please use cudnnFindConvolutionBackwardDataAlgorithm. The total number of resulting algorithms can be queried through the API cudnnGetConvolutionBackwardMaxCount().", - "cudnnGetConvolutionBackwardDataWorkspaceSize": "cudnnGetConvolutionBackwardDataWorkspaceSize returns the amount of GPU memory workspace the user needs to allocate to be able to call cudnnConvolutionBackwardData with the specified algorithm. The workspace allocated will then be passed to the routine cudnnConvolutionBackwardData. The specified algorithm can be the result of the call to cudnnGetConvolutionBackwardDataAlgorithm or can be chosen arbitrarily by the user. Note that not every algorithm is available for every configuration of the input tensor and/or every configuration of the convolution descriptor.", - "cudnnGetConvolutionBackwardFilterAlgorithm": "cudnnGetConvolutionBackwardFilterAlgorithm serves as a heuristic for obtaining the best suited algorithm for cudnnConvolutionBackwardFilter for the given layer specifications. Based on the input preference, this function will either return the fastest algorithm or the fastest algorithm within a given memory limit. For an exhaustive search for the fastest algorithm, please use cudnnFindConvolutionBackwardFilterAlgorithm.", - "cudnnGetConvolutionBackwardFilterAlgorithmMaxCount": "cudnnGetConvolutionBackwardFilterAlgorithmMaxCount returns the maximum number of algorithms which can be returned from cudnnFindConvolutionBackwardFilterAlgorithm() and cudnnGetConvolutionForwardAlgorithm_v7(). cudnnGetConvolutionBackwardFilterAlgorithmMaxCount is the sum of all algorithms plus the sum of all algorithms with Tensor Core operations supported for the current device.", - "cudnnGetConvolutionBackwardFilterAlgorithm_v7": "cudnnGetConvolutionBackwardFilterAlgorithm_v7 serves as a heuristic for obtaining the best suited algorithm for cudnnConvolutionBackwardFilter for the given layer specifications. cudnnGetConvolutionBackwardFilterAlgorithm_v7 will return all algorithms (including CUDNN_TENSOR_OP_MATH and CUDNN_DEFAULT_MATH versions of algorithms where CUDNN_TENSOR_OP_MATH may be available) sorted by expected (based on internal heuristic) relative performance with fastest being index 0 of perfResults. For an exhaustive search for the fastest algorithm, please use cudnnFindConvolutionBackwardFilterAlgorithm. 
The total number of resulting algorithms can be queried through the API cudnnGetConvolutionBackwardMaxCount().", - "cudnnGetConvolutionBackwardFilterWorkspaceSize": "cudnnGetConvolutionBackwardFilterWorkspaceSize returns the amount of GPU memory workspace the user needs to allocate to be able to call cudnnConvolutionBackwardFilter with the specified algorithm. The workspace allocated will then be passed to the routine cudnnConvolutionBackwardFilter. The specified algorithm can be the result of the call to cudnnGetConvolutionBackwardFilterAlgorithm or can be chosen arbitrarily by the user. Note that not every algorithm is available for every configuration of the input tensor and/or every configuration of the convolution descriptor.", - "cudnnGetConvolutionForwardAlgorithm": "cudnnGetConvolutionForwardAlgorithm serves as a heuristic for obtaining the best suited algorithm for cudnnConvolutionForward for the given layer specifications. Based on the input preference, this function will either return the fastest algorithm or the fastest algorithm within a given memory limit. For an exhaustive search for the fastest algorithm, please use cudnnFindConvolutionForwardAlgorithm.", - "cudnnGetConvolutionForwardAlgorithmMaxCount": "cudnnGetConvolutionForwardAlgorithmMaxCount returns the maximum number of algorithms which can be returned from cudnnFindConvolutionForwardAlgorithm() and cudnnGetConvolutionForwardAlgorithm_v7(). cudnnGetConvolutionForwardAlgorithmMaxCount is the sum of all algorithms plus the sum of all algorithms with Tensor Core operations supported for the current device.", - "cudnnGetConvolutionForwardAlgorithm_v7": "cudnnGetConvolutionForwardAlgorithm_v7 serves as a heuristic for obtaining the best suited algorithm for cudnnConvolutionForward for the given layer specifications. cudnnGetConvolutionForwardAlgorithm_v7 will return all algorithms (including CUDNN_TENSOR_OP_MATH and CUDNN_DEFAULT_MATH versions of algorithms where CUDNN_TENSOR_OP_MATH may be available) sorted by expected (based on internal heuristic) relative performance with fastest being index 0 of perfResults. For an exhaustive search for the fastest algorithm, please use cudnnFindConvolutionForwardAlgorithm. The total number of resulting algorithms can be queried through the API cudnnGetConvolutionForwardMaxCount().", - "cudnnGetConvolutionForwardWorkspaceSize": "cudnnGetConvolutionForwardWorkspaceSize returns the amount of GPU memory workspace the user needs to allocate to be able to call cudnnConvolutionForward with the specified algorithm. The workspace allocated will then be passed to the routine cudnnConvolutionForward. The specified algorithm can be the result of the call to cudnnGetConvolutionForwardAlgorithm or can be chosen arbitrarily by the user. 
Note that not every algorithm is available for every configuration of the input tensor and/or every configuration of the convolution descriptor.", - "cudnnGetConvolutionNdDescriptor": "cudnnGetConvolutionNdDescriptor queries a previously initialized convolution descriptor object.", - "cudnnGetConvolutionNdForwardOutputDim": "cudnnGetConvolutionNdForwardOutputDim returns the dimensions of the resulting n-D tensor of a nbDims-2-D convolution, given the convolution descriptor, the input tensor descriptor and the filter descriptor cudnnGetConvolutionNdForwardOutputDim can help to setup the output tensor and allocate the proper amount of memory prior to launch the actual convolution.", - "cudnnGetDropoutDescriptor": "cudnnGetDropoutDescriptor queries the fields of a previously initialized dropout descriptor.", - "cudnnGetErrorString": "cudnnGetErrorString converts the cuDNN status code to a NUL terminated (ASCIIZ) static string. For example, when the input argument is CUDNN_STATUS_SUCCESS, the returned string is `CUDNN_STATUS_SUCCESS`. When an invalid status value is passed to the function, the returned string is `CUDNN_UNKNOWN_STATUS`.", - "cudnnGetFilter4dDescriptor": "cudnnGetFilter4dDescriptor queries the parameters of the previouly initialized filter descriptor object.", - "cudnnGetFilterNdDescriptor": "cudnnGetFilterNdDescriptor queries a previously initialized filter descriptor object.", - "cudnnGetLRNDescriptor": "cudnnGetLRNDescriptor retrieves values stored in the previously initialized LRN descriptor object.", - "cudnnGetOpTensorDescriptor": "cudnnGetOpTensorDescriptor returns configuration of the passed Tensor Pointwise math descriptor.", - "cudnnGetPooling2dDescriptor": "cudnnGetPooling2dDescriptor queries a previously created 2D pooling descriptor object.", - "cudnnGetPooling2dForwardOutputDim": "cudnnGetPooling2dForwardOutputDim provides the output dimensions of a tensor after 2d pooling has been applied", - "cudnnGetPoolingNdDescriptor": "cudnnGetPoolingNdDescriptor queries a previously initialized generic pooling descriptor object.", - "cudnnGetPoolingNdForwardOutputDim": "cudnnGetPoolingNdForwardOutputDim provides the output dimensions of a tensor after Nd pooling has been applied", - "cudnnGetProperty": "cudnnGetProperty writes a specific part of the cuDNN library version number into the provided host storage.", - "cudnnGetRNNDataDescriptor": "cudnnGetRNNDataDescriptor retrieves a previously created RNN data descriptor object.", - "cudnnGetRNNDescriptor": "cudnnGetRNNDescriptor retrieves RNN network parameters that were configured by cudnnSetRNNDescriptor(). All pointers passed to the function should be not-NULL or CUDNN_STATUS_BAD_PARAM is reported. The function does not check the validity of retrieved network parameters. 
The parameters are verified when they are written to the RNN descriptor.", - "cudnnGetRNNLinLayerBiasParams": "cudnnGetRNNLinLayerBiasParams is used to obtain a pointer and a descriptor of every RNN bias column vector in each pseudo-layer within the recurrent network defined by rnnDesc and its input width specified in xDesc.", - "cudnnGetRNNLinLayerMatrixParams": "cudnnGetRNNLinLayerMatrixParams is used to obtain a pointer and a descriptor of every RNN weight matrix in each pseudo-layer within the recurrent network defined by rnnDesc and its input width specified in xDesc.", - "cudnnGetRNNParamsSize": "cudnnGetRNNParamsSize is used to query the amount of parameter space required to execute the RNN described by rnnDesc with inputs dimensions defined by xDesc.", - "cudnnGetRNNPaddingMode": "cudnnGetRNNPaddingMode retrieves the RNN padding mode from the RNN descriptor.", - "cudnnGetRNNProjectionLayers": "(New for 7.1)", - "cudnnGetRNNTrainingReserveSize": "cudnnGetRNNTrainingReserveSize is used to query the amount of reserved space required for training the RNN described by rnnDesc with inputs dimensions defined by xDesc. The same reserved space buffer must be passed to cudnnRNNForwardTraining, cudnnRNNBackwardData and cudnnRNNBackwardWeights. Each of these calls overwrites the contents of the reserved space, however it can safely be backed up and restored between calls if reuse of the memory is desired.", - "cudnnGetRNNWorkspaceSize": "cudnnGetRNNWorkspaceSize is used to query the amount of work space required to execute the RNN described by rnnDesc with inputs dimensions defined by xDesc.", - "cudnnGetReduceTensorDescriptor": "cudnnGetReduceTensorDescriptor queries a previously initialized reduce tensor descriptor object.", - "cudnnGetReductionIndicesSize": "cudnnGetReductionIndicesSize is a helper function to return the minimum size of the index space to be passed to the reduction given the input and output tensors.", - "cudnnGetReductionWorkspaceSize": "cudnnGetReductionWorkspaceSize is a helper function to return the minimum size of the workspace to be passed to the reduction given the input and output tensors.", - "cudnnGetStream": "cudnnGetStream retrieves the user CUDA stream programmed in the cuDNN handle. When the user's CUDA stream was not set in the cuDNN handle, this function reports the null-stream.", - "cudnnGetTensor4dDescriptor": "cudnnGetTensor4dDescriptor queries the parameters of the previouly initialized Tensor4D descriptor object.", - "cudnnGetTensorNdDescriptor": "cudnnGetTensorNdDescriptor retrieves values stored in a previously initialized Tensor descriptor object.", - "cudnnGetTensorSizeInBytes": "cudnnGetTensorSizeInBytes returns the size of the tensor in memory in respect to the given descriptor. cudnnGetTensorSizeInBytes can be used to know the amount of GPU memory to be allocated to hold that tensor.", - "cudnnIm2Col": "cudnnIm2Col constructs the A matrix necessary to perform a forward pass of GEMM convolution. cudnnIm2Col A matrix has a height of batch_size*y_height*y_width and width of input_channels*filter_height*filter_width, where batch_size is xDesc's first dimension, y_height/y_width are computed from cudnnGetConvolutionNdForwardOutputDim(), input_channels is xDesc's second dimension, filter_height/filter_width are wDesc's third and fourth dimension. 
The A matrix is stored in format HW-fully-packed in GPU memory.", - "cudnnLRNCrossChannelBackward": "cudnnLRNCrossChannelBackward performs the backward LRN layer computation.", - "cudnnLRNCrossChannelForward": "cudnnLRNCrossChannelForward performs the forward LRN layer computation.", - "cudnnOpTensor": "cudnnOpTensor implements the equation C = op ( alpha1[0] * A, alpha2[0] * B ) + beta[0] * C, given tensors A, B, and C and scaling factors alpha1, alpha2, and beta. The op to use is indicated by the descriptor opTensorDesc. Currently-supported ops are listed by the cudnnOpTensorOp_t enum.", - "cudnnPoolingBackward": "cudnnPoolingBackward computes the gradient of a pooling operation.", - "cudnnPoolingForward": "cudnnPoolingForward computes pooling of input values (i.e., the maximum or average of several adjacent values) to produce an output with smaller height and/or width.", - "cudnnQueryRuntimeError": "cuDNN library functions perform extensive input argument checking before launching GPU kernels. The last step is to verify that the GPU kernel actually started. When a kernel fails to start, CUDNN_STATUS_EXECUTION_FAILED is returned by the corresponding API call. Typically, after a GPU kernel starts, no runtime checks are performed by the kernel itself -- numerical results are simply written to output buffers.", - "cudnnRNNBackwardData": "cudnnRNNBackwardData executes the recurrent neural network described by rnnDesc with output gradients dy, dhy, dhc, weights w and input gradients dx, dhx, dcx. workspace is required for intermediate storage. The data in reserveSpace must have previously been generated by cudnnRNNForwardTraining. The same reserveSpace data must be used for future calls to cudnnRNNBackwardWeights if they execute on the same input data.", - "cudnnRNNBackwardDataEx": "cudnnRNNBackwardDataEx is the extended version of the function cudnnRNNBackwardData. cudnnRNNBackwardDataEx cudnnRNNBackwardDataEx allows the user to use unpacked (padded) layout for input y and output dx.", - "cudnnRNNBackwardWeights": "cudnnRNNBackwardWeights accumulates weight gradients dw from the recurrent neural network described by rnnDesc with inputs x, hx, and outputs y. The mode of operation in this case is additive, the weight gradients calculated will be added to those already existing in dw. workspace is required for intermediate storage. The data in reserveSpace must have previously been generated by cudnnRNNBackwardData.", - "cudnnRNNBackwardWeightsEx": "cudnnRNNBackwardWeightsEx is the extended version of the function cudnnRNNBackwardWeights. cudnnRNNBackwardWeightsEx cudnnRNNBackwardWeightsEx allows the user to use unpacked (padded) layout for input x and output dw.", - "cudnnRNNForwardInference": "cudnnRNNForwardInference executes the recurrent neural network described by rnnDesc with inputs x, hx, cx, weights w and outputs y, hy, cy. workspace is required for intermediate storage. cudnnRNNForwardInference does not store intermediate data required for training; cudnnRNNForwardTraining should be used for that purpose.", - "cudnnRNNForwardInferenceEx": "cudnnRNNForwardInferenceEx is the extended version of the cudnnRNNForwardInference function. The cudnnRNNForwardTrainingEx allows the user to use unpacked (padded) layout for input x and output y. In the unpacked layout, each sequence in the mini-batch is considered to be of fixed length, specified by maxSeqLength in its corresponding RNNDataDescriptor. 
Each fixed-length sequence, for example, the nth sequence in the mini-batch, is composed of a valid segment, specified by the seqLengthArray[n] in its corresponding RNNDataDescriptor, and a padding segment to make the combined sequence length equal to maxSeqLength.", - "cudnnRNNForwardTraining": "cudnnRNNForwardTraining executes the recurrent neural network described by rnnDesc with inputs x, hx, cx, weights w and outputs y, hy, cy. workspace is required for intermediate storage. reserveSpace stores data required for training. The same reserveSpace data must be used for future calls to cudnnRNNBackwardData and cudnnRNNBackwardWeights if these execute on the same input data.", - "cudnnRNNForwardTrainingEx": "cudnnRNNForwardTrainingEx is the extended version of the cudnnRNNForwardTraining function. The cudnnRNNForwardTrainingEx allows the user to use unpacked (padded) layout for input x and output y.", - "cudnnRNNGetClip": "Retrieves the current LSTM cell clipping parameters, and stores them in the arguments provided.", - "cudnnRNNSetClip": "Sets the LSTM cell clipping mode. The LSTM clipping is disabled by default. When enabled, clipping is applied to all layers. cudnnRNNSetClip cudnnRNNSetClip() function may be called multiple times.", - "cudnnReduceTensor": "cudnnReduceTensor reduces tensor A by implementing the equation C = alpha * reduce op ( A ) + beta * C, given tensors A and C and scaling factors alpha and beta. The reduction op to use is indicated by the descriptor reduceTensorDesc. Currently-supported ops are listed by the cudnnReduceTensorOp_t enum.", - "cudnnRestoreAlgorithm": "(New for 7.1)", - "cudnnRestoreDropoutDescriptor": "cudnnRestoreDropoutDescriptor restores a dropout descriptor to a previously saved-off state.", - "cudnnSaveAlgorithm": "(New for 7.1)", - "cudnnScaleTensor": "cudnnScaleTensor scale all the elements of a tensor by a given factor.", - "cudnnSetActivationDescriptor": "cudnnSetActivationDescriptor initializes a previously created generic activation descriptor object.", - "cudnnSetAlgorithmDescriptor": "(New for 7.1)", - "cudnnSetAlgorithmPerformance": "(New for 7.1)", - "cudnnSetCTCLossDescriptor": "cudnnSetCTCLossDescriptor sets a CTC loss function descriptor.", - "cudnnSetCallback": "(New for 7.1)", - "cudnnSetConvolution2dDescriptor": "cudnnSetConvolution2dDescriptor initializes a previously created convolution descriptor object into a 2D correlation. cudnnSetConvolution2dDescriptor assumes that the tensor and filter descriptors corresponds to the formard convolution path and checks if their settings are valid. That same convolution descriptor can be reused in the backward path provided it corresponds to the same layer.", - "cudnnSetConvolutionNdDescriptor": "cudnnSetConvolutionNdDescriptor initializes a previously created generic convolution descriptor object into a n-D correlation. That same convolution descriptor can be reused in the backward path provided it corresponds to the same layer. The convolution computation will done in the specified dataType, which can be potentially different from the input/output tensors.", - "cudnnSetDropoutDescriptor": "cudnnSetDropoutDescriptor initializes a previously created dropout descriptor object. If states argument is equal to NULL, random number generator states won't be initialized, and only dropout value will be set. No other function should be writing to the memory pointed at by states argument while this function is running. 
The user is expected not to change memory pointed at by states for the duration of the computation.", - "cudnnSetFilter4dDescriptor": "cudnnSetFilter4dDescriptor initializes a previously created filter descriptor object into a 4D filter. The layout of the filters must be contiguous in memory.", - "cudnnSetFilterNdDescriptor": "cudnnSetFilterNdDescriptor initializes a previously created filter descriptor object. The layout of the filters must be contiguous in memory.", - "cudnnSetLRNDescriptor": "cudnnSetLRNDescriptor initializes a previously created LRN descriptor object.", - "cudnnSetOpTensorDescriptor": "cudnnSetOpTensorDescriptor initializes a Tensor Pointwise math descriptor.", - "cudnnSetPooling2dDescriptor": "cudnnSetPooling2dDescriptor initializes a previously created generic pooling descriptor object into a 2D description.", - "cudnnSetPoolingNdDescriptor": "cudnnSetPoolingNdDescriptor initializes a previously created generic pooling descriptor object.", - "cudnnSetRNNDataDescriptor": "cudnnSetRNNDataDescriptor initializes a previously created RNN data descriptor object. cudnnSetRNNDataDescriptor data structure is intended to support the unpacked (padded) layout for input and output of extended RNN inference and training functions. A packed (unpadded) layout is also supported for backward compatibility.", - "cudnnSetRNNDescriptor": "cudnnSetRNNDescriptor initializes a previously created RNN descriptor object.", - "cudnnSetRNNDescriptor_v5": "cudnnSetRNNDescriptor_v5 initializes a previously created RNN descriptor object.", - "cudnnSetRNNDescriptor_v6": "cudnnSetRNNDescriptor_v6 initializes a previously created RNN descriptor object.", - "cudnnSetRNNMatrixMathType": "cudnnSetRNNMatrixMathType sets the preferred option to use NVIDIA Tensor Cores accelerators on Volta GPU-s (SM 7.0 or higher). When the mType parameter is CUDNN_TENSOR_OP_MATH, inference and training RNN API-s will attempt use Tensor Cores when weights/biases are of type CUDNN_DATA_HALF or CUDNN_DATA_FLOAT. When RNN weights/biases are stored in the CUDNN_DATA_FLOAT format, the original weights and intermediate results will be down-converted to CUDNN_DATA_HALF before they are used in another recursive iteration.", - "cudnnSetRNNPaddingMode": "cudnnSetRNNPaddingMode enables or disables the padded RNN input/output for a previously created and initialized RNN descriptor. cudnnSetRNNPaddingMode information is required before calling the cudnnGetRNNWorkspaceSize and cudnnGetRNNTrainingReserveSize functions, to determine whether additional workspace and training reserve space is needed. By default the padded RNN input/output is not enabled.", - "cudnnSetRNNProjectionLayers": "(New for 7.1)", - "cudnnSetReduceTensorDescriptor": "cudnnSetReduceTensorDescriptor initializes a previously created reduce tensor descriptor object.", - "cudnnSetSpatialTransformerNdDescriptor": "cudnnSetSpatialTransformerNdDescriptor initializes a previously created generic spatial transformer descriptor object.", - "cudnnSetStream": "cudnnSetStream sets the user's CUDA stream in the cuDNN handle. The new stream will be used to launch cuDNN GPU kernels or to synchronize to this stream when cuDNN kernels are launched in the internal streams. If the cuDNN library stream is not set, all kernels use the default (NULL) stream. 
Setting the user stream in the cuDNN handle guarantees the issue-order execution of cuDNN calls and other GPU kernels launched in the same stream.", - "cudnnSetTensor": "cudnnSetTensor sets all the elements of a tensor to a given value.", - "cudnnSetTensor4dDescriptor": "cudnnSetTensor4dDescriptor initializes a previously created generic Tensor descriptor object into a 4D tensor. The strides of the four dimensions are inferred from the format parameter and set in such a way that the data is contiguous in memory with no padding between dimensions.", - "cudnnSetTensor4dDescriptorEx": "cudnnSetTensor4dDescriptorEx initializes a previously created generic Tensor descriptor object into a 4D tensor, similarly to cudnnSetTensor4dDescriptor but with the strides explicitly passed as parameters. cudnnSetTensor4dDescriptorEx can be used to lay out the 4D tensor in any order or simply to define gaps between dimensions.", - "cudnnSetTensorNdDescriptor": "cudnnSetTensorNdDescriptor initializes a previously created generic Tensor descriptor object.", - "cudnnSetTensorNdDescriptorEx": "cudnnSetTensorNdDescriptorEx initializes an n-D tensor descriptor.", - "cudnnSoftmaxBackward": "cudnnSoftmaxBackward computes the gradient of the softmax function.", - "cudnnSoftmaxForward": "cudnnSoftmaxForward computes the softmax function.", - "cudnnSpatialTfGridGeneratorBackward": "cudnnSpatialTfGridGeneratorBackward computes the gradient of a grid generation operation.", - "cudnnSpatialTfGridGeneratorForward": "cudnnSpatialTfGridGeneratorForward generates a grid of coordinates in the input tensor corresponding to each pixel from the output tensor.", - "cudnnSpatialTfSamplerBackward": "cudnnSpatialTfSamplerBackward computes the gradient of a sampling operation.", - "cudnnSpatialTfSamplerForward": "cudnnSpatialTfSamplerForward performs a sampler operation and generates the output tensor using the grid given by the grid generator.", - "cudnnTransformTensor": "cudnnTransformTensor copies the scaled data from one tensor to another tensor with a different layout. Those descriptors need to have the same dimensions but not necessarily the same strides. The input and output tensors must not overlap in any way (i.e., tensors cannot be transformed in place). cudnnTransformTensor can be used to convert a tensor with an unsupported format to a supported one.", + "cudnnActivationForward": "cudnnActivationForward applies a specified neuron activation function element-wise over each input value.", + "cudnnAddTensor": "cudnnAddTensor adds the scaled values of a bias tensor to another tensor. Each dimension of the bias tensor A must match the corresponding dimension of the destination tensor C or must be equal to 1. In the latter case, the same value from the bias tensor for those dimensions will be used to blend into the C tensor.", + "cudnnBatchNormalizationForwardInference": "cudnnBatchNormalizationForwardInference performs the forward BatchNormalization layer computation for the inference phase.", + "cudnnCreate": "cudnnCreate initializes the cuDNN library and creates a handle to an opaque structure holding the cuDNN library context. It allocates hardware resources on the host and device and must be called prior to making any other cuDNN library calls.", + "cudnnCreateActivationDescriptor": "cudnnCreateActivationDescriptor creates an activation descriptor object by allocating the memory needed to hold its opaque structure.
For more information, see cudnnActivationDescriptor_t.", + "cudnnCreateAlgorithmDescriptor": "cudnnCreateAlgorithmDescriptor has been deprecated in cuDNN 8.0.", + "cudnnCreateAlgorithmPerformance": "cudnnCreateAlgorithmPerformance creates multiple algorithm performance objects by allocating the memory needed to hold their opaque structures.", + "cudnnCreateDropoutDescriptor": "cudnnCreateDropoutDescriptor creates a generic dropout descriptor object by allocating the memory needed to hold its opaque structure. For more information, see cudnnDropoutDescriptor_t.", + "cudnnCreateFilterDescriptor": "cudnnCreateFilterDescriptor creates a filter descriptor object by allocating the memory needed to hold its opaque structure. For more information, see cudnnFilterDescriptor_t.", + "cudnnCreateLRNDescriptor": "cudnnCreateLRNDescriptor allocates the memory needed to hold the data needed for LRN and DivisiveNormalization layers operation and returns a descriptor used with subsequent layer forward and backward calls.", + "cudnnCreateOpTensorDescriptor": "cudnnCreateOpTensorDescriptor creates a tensor pointwise math descriptor. For more information, see cudnnOpTensorDescriptor_t.", + "cudnnCreatePoolingDescriptor": "cudnnCreatePoolingDescriptor creates a pooling descriptor object by allocating the memory needed to hold its opaque structure.", + "cudnnCreateReduceTensorDescriptor": "cudnnCreateReduceTensorDescriptor creates a reduced tensor descriptor object by allocating the memory needed to hold its opaque structure.", + "cudnnCreateSpatialTransformerDescriptor": "cudnnCreateSpatialTransformerDescriptor creates a generic spatial transformer descriptor object by allocating the memory needed to hold its opaque structure.", + "cudnnCreateTensorDescriptor": "cudnnCreateTensorDescriptor creates a generic tensor descriptor object by allocating the memory needed to hold its opaque structure. The data is initialized to all zeros.", + "cudnnCreateTensorTransformDescriptor": "cudnnCreateTensorTransformDescriptor creates a tensor transform descriptor object by allocating the memory needed to hold its opaque structure. The tensor data is initialized to be all zero. Use the cudnnSetTensorTransformDescriptor() function to initialize the descriptor created by this function.", + "cudnnDeriveBNTensorDescriptor": "cudnnDeriveBNTensorDescriptor derives a secondary tensor descriptor for the batch normalization scale, invVariance, bnBias, and bnScale subtensors from the layer's x data descriptor.", + "cudnnDeriveNormTensorDescriptor": "cudnnDeriveNormTensorDescriptor derives tensor descriptors for the normalization mean, invariance, normBias, and normScale subtensors from the layer's x data descriptor and norm mode. normalization, mean, and invariance share the same descriptor while bias and scale share the same descriptor.", + "cudnnDestroy": "cudnnDestroy releases the resources used by the cuDNN handle. cudnnDestroy is usually the last call with a particular handle to the cuDNN handle. 
Because cudnnCreate() allocates some internal resources, the release of those resources by calling cudnnDestroy() will implicitly call cudaDeviceSynchronize; therefore, the recommended best practice is to call cudnnCreate/cudnnDestroy outside of performance-critical code paths.", + "cudnnDestroyActivationDescriptor": "cudnnDestroyActivationDescriptor destroys a previously created activation descriptor object.", + "cudnnDestroyAlgorithmDescriptor": "cudnnDestroyAlgorithmDescriptor has been deprecated in cuDNN 8.0.", + "cudnnDestroyAlgorithmPerformance": "cudnnDestroyAlgorithmPerformance destroys a previously created algorithm descriptor object.", + "cudnnDestroyDropoutDescriptor": "cudnnDestroyDropoutDescriptor destroys a previously created dropout descriptor object.", + "cudnnDestroyFilterDescriptor": "cudnnDestroyFilterDescriptor destroys a previously created filter descriptor object.", + "cudnnDestroyLRNDescriptor": "cudnnDestroyLRNDescriptor destroys a previously created LRN descriptor object.", + "cudnnDestroyOpTensorDescriptor": "cudnnDestroyOpTensorDescriptor deletes a tensor pointwise math descriptor object.", + "cudnnDestroyPoolingDescriptor": "cudnnDestroyPoolingDescriptor destroys a previously created pooling descriptor object.", + "cudnnDestroyReduceTensorDescriptor": "cudnnDestroyReduceTensorDescriptor destroys a previously created reduce tensor descriptor object. When the input pointer is NULL, this function performs no destroy operation.", + "cudnnDestroySpatialTransformerDescriptor": "cudnnDestroySpatialTransformerDescriptor destroys a previously created spatial transformer descriptor object.", + "cudnnDestroyTensorDescriptor": "cudnnDestroyTensorDescriptor destroys a previously created tensor descriptor object. When the input pointer is NULL, this function performs no destroy operation.", + "cudnnDestroyTensorTransformDescriptor": "Destroys a previously created tensor transform descriptor.", + "cudnnDivisiveNormalizationForward": "The x-mean(x) which is often referred to as `subtractive normalization` portion of the computation can be implemented using cuDNN average pooling layer followed by a call to addTensor.", + "cudnnDropoutForward": "cudnnDropoutForward performs the forward dropout operation over x, returning results in y. If dropout was used as a parameter to cudnnSetDropoutDescriptor(), approximately the dropout fraction of x values will be replaced by a 0, and the rest will be scaled by 1/(1-dropout). cudnnDropoutForward should not be run concurrently with another cudnnDropoutForward() call using the same states.", + "cudnnDropoutGetReserveSpaceSize": "cudnnDropoutGetReserveSpaceSize is used to query the amount of reserve space needed to run dropout with the input dimensions given by xDesc. The same reserve space is expected to be passed to cudnnDropoutForward() and cudnnDropoutBackward(), and its contents are expected to remain unchanged between cudnnDropoutForward() and cudnnDropoutBackward() calls.", + "cudnnDropoutGetStatesSize": "cudnnDropoutGetStatesSize is used to query the amount of space required to store the states of the random number generators used by the cudnnDropoutForward() function.", + "cudnnGetActivationDescriptor": "cudnnGetActivationDescriptor queries a previously initialized generic activation descriptor object.", + "cudnnGetAlgorithmDescriptor": "cudnnGetAlgorithmDescriptor has been deprecated in cuDNN 8.0.", + "cudnnGetAlgorithmPerformance": "cudnnGetAlgorithmPerformance has been deprecated in cuDNN 8.0.", + "cudnnGetAlgorithmSpaceSize": "cudnnGetAlgorithmSpaceSize has been deprecated in cuDNN 8.0.", + "cudnnGetCallback": "cudnnGetCallback queries the internal states of cuDNN error reporting functionality.", + "cudnnGetDropoutDescriptor": "cudnnGetDropoutDescriptor queries the fields of a previously initialized dropout descriptor.", + "cudnnGetErrorString": "cudnnGetErrorString converts the cuDNN status code to a NULL terminated (ASCIIZ) static string. For example, when the input argument is CUDNN_STATUS_SUCCESS, the returned string is CUDNN_STATUS_SUCCESS. When an invalid status value is passed to the function, the returned string is CUDNN_UNKNOWN_STATUS.", + "cudnnGetFilter4dDescriptor": "cudnnGetFilter4dDescriptor queries the parameters of the previously initialized filter descriptor object.", + "cudnnGetFilterNdDescriptor": "cudnnGetFilterNdDescriptor queries a previously initialized filter descriptor object.", + "cudnnGetFilterSizeInBytes": "cudnnGetFilterSizeInBytes returns the size of the filter tensor in memory with respect to the given descriptor. It can be used to know the amount of GPU memory to be allocated to hold that filter tensor.", + "cudnnGetLRNDescriptor": "cudnnGetLRNDescriptor retrieves values stored in the previously initialized LRN descriptor object.", + "cudnnGetOpTensorDescriptor": "cudnnGetOpTensorDescriptor returns the configuration of the passed tensor pointwise math descriptor.", + "cudnnGetPooling2dDescriptor": "cudnnGetPooling2dDescriptor queries a previously created 2D pooling descriptor object.", + "cudnnGetPooling2dForwardOutputDim": "cudnnGetPooling2dForwardOutputDim provides the output dimensions of a tensor after 2d pooling has been applied.", + "cudnnGetPoolingNdDescriptor": "cudnnGetPoolingNdDescriptor queries a previously initialized generic pooling descriptor object.", + "cudnnGetPoolingNdForwardOutputDim": "cudnnGetPoolingNdForwardOutputDim provides the output dimensions of a tensor after Nd pooling has been applied.", + "cudnnGetProperty": "cudnnGetProperty writes a specific part of the cuDNN library version number into the provided host storage.", + "cudnnGetReduceTensorDescriptor": "cudnnGetReduceTensorDescriptor queries a previously initialized reduce tensor descriptor object.", + "cudnnGetReductionIndicesSize": "cudnnGetReductionIndicesSize is a helper function to return the minimum size of the index space to be passed to the reduction given the input and output tensors.", + "cudnnGetReductionWorkspaceSize": "cudnnGetReductionWorkspaceSize is a helper function to return the minimum size of the workspace to be passed to the reduction given the input and output tensors.", + "cudnnGetStream": "cudnnGetStream retrieves the user CUDA stream programmed in the cuDNN handle. 
When the user's CUDA stream is not set in the cuDNN handle, this function reports the null-stream.", + "cudnnGetTensor4dDescriptor": "cudnnGetTensor4dDescriptor queries the parameters of the previously initialized tensor4D descriptor object.", + "cudnnGetTensorNdDescriptor": "cudnnGetTensorNdDescriptor retrieves values stored in a previously initialized tensor descriptor object.", + "cudnnGetTensorSizeInBytes": "cudnnGetTensorSizeInBytes returns the size of the tensor in memory with respect to the given descriptor. cudnnGetTensorSizeInBytes can be used to know the amount of GPU memory to be allocated to hold that tensor.", + "cudnnGetTensorTransformDescriptor": "cudnnGetTensorTransformDescriptor returns the values stored in a previously initialized tensor transform descriptor.", + "cudnnLRNCrossChannelForward": "cudnnLRNCrossChannelForward performs the forward LRN layer computation.", + "cudnnNormalizationForwardInference": "cudnnNormalizationForwardInference performs the forward normalization layer computation for the inference phase.", + "cudnnOpsInferVersionCheck": "cudnnOpsInferVersionCheck is the first of a series of corresponding functions that check for consistent library versions among DLL files for different modules.", + "cudnnOpTensor": "cudnnOpTensor implements the equation C = op(alpha1[0] * A, alpha2[0] * B) + beta[0] * C, given the tensors A, B, and C and the scaling factors alpha1, alpha2, and beta. The op to use is indicated by the descriptor cudnnOpTensorDescriptor_t, meaning, the type of opTensorDesc. Currently-supported ops are listed by the cudnnOpTensorOp_t enum.", + "cudnnPoolingForward": "cudnnPoolingForward computes pooling of input values (meaning, the maximum or average of several adjacent values) to produce an output with smaller height and/or width.", + "cudnnQueryRuntimeError": "cuDNN library functions perform extensive input argument checking before launching GPU kernels. The last step is to verify that the GPU kernel actually started. When a kernel fails to start, CUDNN_STATUS_EXECUTION_FAILED is returned by the corresponding API call. Typically, after a GPU kernel starts, no runtime checks are performed by the kernel itself - numerical results are simply written to output buffers.", + "cudnnReduceTensor": "cudnnReduceTensor reduces tensor A by implementing the equation C = alpha * reduce op ( A ) + beta * C, given tensors A and C and scaling factors alpha and beta. The reduction op to use is indicated by the descriptor reduceTensorDesc. Currently-supported ops are listed by the cudnnReduceTensorOp_t enum.", + "cudnnRestoreAlgorithm": "cudnnRestoreAlgorithm has been deprecated in cuDNN 8.0.", + "cudnnRestoreDropoutDescriptor": "cudnnRestoreDropoutDescriptor restores a dropout descriptor to a previously saved-off state.", + "cudnnSaveAlgorithm": "cudnnSaveAlgorithm has been deprecated in cuDNN 8.0.", + "cudnnScaleTensor": "cudnnScaleTensor scales all the elements of a tensor by a given factor.", + "cudnnSetActivationDescriptor": "cudnnSetActivationDescriptor initializes a previously created generic activation descriptor object.", + "cudnnSetAlgorithmDescriptor": "cudnnSetAlgorithmDescriptor has been deprecated in cuDNN 8.0.", + "cudnnSetAlgorithmPerformance": "cudnnSetAlgorithmPerformance has been deprecated in cuDNN 8.0.", + "cudnnSetCallback": "cudnnSetCallback sets the internal states of cuDNN error reporting functionality.", + "cudnnSetDropoutDescriptor": "cudnnSetDropoutDescriptor initializes a previously created dropout descriptor object. 
If the states argument is equal to NULL, then the random number generator states won't be initialized, and only the dropout value will be set. The user is expected not to change the memory pointed at by states for the duration of the computation.", + "cudnnSetFilter4dDescriptor": "cudnnSetFilter4dDescriptor initializes a previously created filter descriptor object into a 4D filter. The layout of the filters must be contiguous in memory.", + "cudnnSetFilterNdDescriptor": "cudnnSetFilterNdDescriptor initializes a previously created filter descriptor object. The layout of the filters must be contiguous in memory.", + "cudnnSetLRNDescriptor": "cudnnSetLRNDescriptor initializes a previously created LRN descriptor object.", + "cudnnSetOpTensorDescriptor": "cudnnSetOpTensorDescriptor initializes a tensor pointwise math descriptor.", + "cudnnSetPooling2dDescriptor": "cudnnSetPooling2dDescriptor initializes a previously created generic pooling descriptor object into a 2D description.", + "cudnnSetPoolingNdDescriptor": "cudnnSetPoolingNdDescriptor initializes a previously created generic pooling descriptor object.", + "cudnnSetReduceTensorDescriptor": "cudnnSetReduceTensorDescriptor initializes a previously created reduce tensor descriptor object.", + "cudnnSetSpatialTransformerNdDescriptor": "cudnnSetSpatialTransformerNdDescriptor initializes a previously created generic spatial transformer descriptor object.", + "cudnnSetStream": "cudnnSetStream sets the user's CUDA stream in the cuDNN handle. The new stream will be used to launch cuDNN GPU kernels or to synchronize to this stream when cuDNN kernels are launched in the internal streams. If the cuDNN library stream is not set, all kernels use the default (NULL) stream. Setting the user stream in the cuDNN handle guarantees the issue-order execution of cuDNN calls and other GPU kernels launched in the same stream.", + "cudnnSetTensor": "cudnnSetTensor sets all the elements of a tensor to a given value.", + "cudnnSetTensor4dDescriptor": "cudnnSetTensor4dDescriptor initializes a previously created generic tensor descriptor object into a 4D tensor. The strides of the four dimensions are inferred from the format parameter and set in such a way that the data is contiguous in memory with no padding between dimensions.", + "cudnnSetTensor4dDescriptorEx": "cudnnSetTensor4dDescriptorEx initializes a previously created generic tensor descriptor object into a 4D tensor, similarly to cudnnSetTensor4dDescriptor() but with the strides explicitly passed as parameters. 
cudnnSetTensor4dDescriptorEx can be used to lay out the 4D tensor in any order or simply to define gaps between dimensions.", + "cudnnSetTensorNdDescriptor": "cudnnSetTensorNdDescriptor initializes a previously created generic tensor descriptor object.", + "cudnnSetTensorNdDescriptorEx": "cudnnSetTensorNdDescriptorEx initializes an n-D tensor descriptor.", + "cudnnSetTensorTransformDescriptor": "cudnnSetTensorTransformDescriptor initializes a tensor transform descriptor that was previously created using the cudnnCreateTensorTransformDescriptor() function.", + "cudnnSoftmaxForward": "cudnnSoftmaxForward computes the softmax function.", + "cudnnSpatialTfGridGeneratorForward": "cudnnSpatialTfGridGeneratorForward generates a grid of coordinates in the input tensor corresponding to each pixel from the output tensor.", + "cudnnSpatialTfSamplerForward": "cudnnSpatialTfSamplerForward performs a sampler operation and generates the output tensor using the grid given by the grid generator.", + "cudnnTransformFilter": "cudnnTransformFilter converts the filter between different formats, data types, or dimensions based on the described transformation. It can be used to convert a filter with an unsupported layout format to a filter with a supported layout format.", + "cudnnTransformTensor": "cudnnTransformTensor copies the scaled data from one tensor to another tensor with a different layout. Those descriptors need to have the same dimensions but not necessarily the same strides. The input and output tensors must not overlap in any way (meaning, tensors cannot be transformed in place). cudnnTransformTensor can be used to convert a tensor with an unsupported format to a supported one.", + "cudnnTransformTensorEx": "cudnnTransformTensorEx converts the tensor layouts between different formats. It can be used to convert a tensor with an unsupported layout format to a tensor with a supported layout format.", + "cudnnActivationBackward": "cudnnActivationBackward computes the gradient of a neuron activation function.", + "cudnnBatchNormalizationBackward": "For more information, see cudnnDeriveBNTensorDescriptor() for the secondary tensor descriptor generation for the parameters used in this function.", + "cudnnBatchNormalizationBackwardEx": "If workspace is NULL and workSpaceSizeInBytes of zero is passed in, this API will function exactly like the non-extended function cudnnBatchNormalizationBackward.", + "cudnnBatchNormalizationForwardTraining": "cudnnBatchNormalizationForwardTraining performs the forward batch normalization layer computation for the training phase.", + "cudnnBatchNormalizationForwardTrainingEx": "cudnnBatchNormalizationForwardTrainingEx is an extension of cudnnBatchNormalizationForwardTraining() for performing the forward batch normalization layer computation.", + "cudnnDivisiveNormalizationBackward": "cudnnDivisiveNormalizationBackward performs the backward DivisiveNormalization layer computation.", + "cudnnDropoutBackward": "cudnnDropoutBackward performs the backward dropout operation over dy, returning results in dx. If, during the forward dropout operation, a value from x was propagated to y, then during the backward operation the value from dy will be propagated to dx; otherwise, the dx value will be set to 0.", + "cudnnGetBatchNormalizationBackwardExWorkspaceSize": "cudnnGetBatchNormalizationBackwardExWorkspaceSize returns the amount of GPU memory workspace the user should allocate to be able to call the cudnnBatchNormalizationBackwardEx() function for the specified bnOps input setting. The workspace allocated will then be passed to the function cudnnBatchNormalizationBackwardEx().", + "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize": "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize returns the amount of GPU memory workspace the user should allocate to be able to call the cudnnBatchNormalizationForwardTrainingEx() function for the specified bnOps input setting. The workspace allocated should then be passed by the user to the function cudnnBatchNormalizationForwardTrainingEx().", + "cudnnGetBatchNormalizationTrainingExReserveSpaceSize": "cudnnGetBatchNormalizationTrainingExReserveSpaceSize returns the amount of reserve GPU memory workspace the user should allocate for the batch normalization operation, for the specified bnOps input setting. In contrast to the workspace, the reserved space should be preserved between the forward and backward calls, and the data should not be altered.", + "cudnnGetNormalizationBackwardWorkspaceSize": "cudnnGetNormalizationBackwardWorkspaceSize returns the amount of GPU memory workspace the user should allocate to be able to call cudnnNormalizationBackward() function for the specified normOps and algo input setting. The workspace allocated will then be passed to the function cudnnNormalizationBackward().", + "cudnnGetNormalizationForwardTrainingWorkspaceSize": "cudnnGetNormalizationForwardTrainingWorkspaceSize returns the amount of GPU memory workspace the user should allocate to be able to call cudnnNormalizationForwardTraining() function for the specified normOps and algo input setting. The workspace allocated should then be passed by the user to the function cudnnNormalizationForwardTraining().", + "cudnnGetNormalizationTrainingReserveSpaceSize": "cudnnGetNormalizationTrainingReserveSpaceSize returns the amount of reserve GPU memory workspace the user should allocate for the normalization operation, for the specified normOps input setting. In contrast to the workspace, the reserved space should be preserved between the forward and backward calls, and the data should not be altered.", + "cudnnLRNCrossChannelBackward": "cudnnLRNCrossChannelBackward performs the backward LRN layer computation.", + "cudnnNormalizationBackward": "The epsilon value has to be the same during training, backpropagation, and inference. cudnnNormalizationBackward workspace is not required to be clean. Moreover, the workspace does not have to remain unchanged between the forward and backward pass, as it is not used for passing any information.", + "cudnnNormalizationForwardTraining": "cudnnNormalizationForwardTraining workspace is not required to be clean. Moreover, the workspace does not have to remain unchanged between the forward and backward pass, as it is not used for passing any information. The extended cudnnNormalizationForwardTraining function can accept a *workspace pointer to the GPU workspace, and workSpaceSizeInBytes, the size of the workspace, from the user.",
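All of the cudnnGet*WorkspaceSize entries above document the same calling convention: query the required byte count, allocate that much device memory, then pass the same pointer and size to the compute call. A minimal sketch of that pattern in Go, using gorgonia.org/cu for the allocation; the two ctx methods are hypothetical stand-ins, not the actual generated API:

	// forwardTrainingWithWorkspace demonstrates the query -> allocate -> call
	// convention described by the workspace-size entries.
	// GetNormalizationForwardTrainingWorkspaceSize and NormalizationForwardTraining
	// are illustrative names only.
	func forwardTrainingWithWorkspace(ctx *Context, xDesc, yDesc *TensorDescriptor) error {
		size, err := ctx.GetNormalizationForwardTrainingWorkspaceSize(xDesc, yDesc) // 1: query the required size
		if err != nil {
			return err
		}
		ws, err := cu.MemAlloc(int64(size)) // 2: allocate exactly the reported amount on the device
		if err != nil {
			return err
		}
		defer cu.MemFree(ws)
		return ctx.NormalizationForwardTraining(xDesc, yDesc, ws, size) // 3: hand the pointer and size back
	}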
+ "cudnnOpsTrainVersionCheck": "cudnnOpsTrainVersionCheck checks whether the version of the OpsTrain subset of the library is consistent with the other sub-libraries.", + "cudnnPoolingBackward": "cudnnPoolingBackward computes the gradient of a pooling operation.", + "cudnnSoftmaxBackward": "cudnnSoftmaxBackward computes the gradient of the softmax function.", + "cudnnSpatialTfGridGeneratorBackward": "cudnnSpatialTfGridGeneratorBackward computes the gradient of a grid generation operation.", + "cudnnSpatialTfSamplerBackward": "cudnnSpatialTfSamplerBackward computes the gradient of a sampling operation.", + "cudnnBackendCreateDescriptor": "cudnnBackendCreateDescriptor allocates memory for a backend descriptor of the given cudnnBackendDescriptorType_t and stores it at the location pointed to by the descriptor argument.", + "cudnnBackendDestroyDescriptor": "cudnnBackendDestroyDescriptor destroys instances of cudnnBackendDescriptor_t that were previously created using cudnnBackendCreateDescriptor().", + "cudnnBackendExecute": "The data and the working space are encapsulated in the VariantPack.", + "cudnnBackendFinalize": "cudnnBackendFinalize finalizes the memory pointed to by the descriptor. The type of finalization depends on the descriptorType argument with which the descriptor was created using cudnnBackendCreate() or initialized using cudnnBackendInitialize().", + "cudnnBackendGetAttribute": "cudnnBackendGetAttribute retrieves the value(s) of an attribute of a descriptor. attributeName is the name of the attribute whose value is requested. The attributeType is the type of attribute. requestedElementCount is the number of elements to be potentially retrieved. The number of elements for the requested attribute is stored in elementCount. The retrieved values are stored in arrayOfElements. When the attribute is expected to have a single value, arrayOfElements can be a pointer to the output value. cudnnBackendGetAttribute will return CUDNN_STATUS_NOT_INITIALIZED if the descriptor has not been successfully finalized.", + "cudnnBackendInitialize": "cudnnBackendInitialize repurposes pre-allocated memory pointed to by a descriptor of size sizeInBytes to a backend descriptor of type descriptorType. The necessary size for a descriptor type can be acquired by calling the function cudnnBackendGetSizeOf(). The finalized state of the descriptor is set to false.", + "cudnnBackendSetAttribute": "cudnnBackendSetAttribute sets an attribute of a descriptor to value(s) provided as a pointer. descriptor is the descriptor to be set. attributeName is the name of the attribute to be set. attributeType is the type of attribute. The value to which the attribute is set is pointed to by the arrayOfElements. The number of elements is given by elementCount. cudnnBackendSetAttribute will return CUDNN_STATUS_NOT_INITIALIZED if the descriptor is already successfully finalized using cudnnBackendFinalize().", + "cudnnConvolutionBackwardData": "cudnnConvolutionBackwardData computes the convolution data gradient of the tensor dy, where y is the output of the forward convolution in cudnnConvolutionForward(). It uses the specified algo, and returns the results in the output tensor dx. Scaling factors alpha and beta can be used to scale the computed result or accumulate with the current dx.", + "cudnnConvolutionBiasActivationForward": "cudnnConvolutionBiasActivationForward applies a bias and then an activation to the convolutions or cross-correlations of cudnnConvolutionForward(), returning results in y.", + "cudnnConvolutionForward": "cudnnConvolutionForward executes convolutions or cross-correlations over x using filters specified with w, returning results in y. Scaling factors alpha and beta can be used to scale the input tensor and the output tensor respectively.", + "cudnnCreateConvolutionDescriptor": "cudnnCreateConvolutionDescriptor creates a convolution descriptor object by allocating the memory needed to hold its opaque structure. For more information, see cudnnConvolutionDescriptor_t.", + "cudnnDestroyConvolutionDescriptor": "cudnnDestroyConvolutionDescriptor destroys a previously created convolution descriptor object.", + "cudnnFindConvolutionBackwardDataAlgorithm": "cudnnFindConvolutionBackwardDataAlgorithm attempts all algorithms available for cudnnConvolutionBackwardData(). It will attempt both the provided convDesc mathType and CUDNN_DEFAULT_MATH (assuming the two differ).", + "cudnnFindConvolutionBackwardDataAlgorithmEx": "cudnnFindConvolutionBackwardDataAlgorithmEx attempts all algorithms available for cudnnConvolutionBackwardData(). It will attempt both the provided convDesc mathType and CUDNN_DEFAULT_MATH (assuming the two differ).", + "cudnnFindConvolutionForwardAlgorithm": "cudnnFindConvolutionForwardAlgorithm attempts all algorithms available for cudnnConvolutionForward(). It will attempt both the provided convDesc mathType and CUDNN_DEFAULT_MATH (assuming the two differ).", + "cudnnFindConvolutionForwardAlgorithmEx": "cudnnFindConvolutionForwardAlgorithmEx attempts all algorithms available for cudnnConvolutionForward(). It will attempt both the provided convDesc mathType and CUDNN_DEFAULT_MATH (assuming the two differ).", + "cudnnGetConvolution2dDescriptor": "cudnnGetConvolution2dDescriptor queries a previously initialized 2D convolution descriptor object.", + "cudnnGetConvolution2dForwardOutputDim": "cudnnGetConvolution2dForwardOutputDim returns the dimensions of the resulting 4D tensor of a 2D convolution, given the convolution descriptor, the input tensor descriptor and the filter descriptor. cudnnGetConvolution2dForwardOutputDim can help to set up the output tensor and allocate the proper amount of memory prior to launching the actual convolution.", + "cudnnGetConvolutionBackwardDataAlgorithmMaxCount": "cudnnGetConvolutionBackwardDataAlgorithmMaxCount returns the maximum number of algorithms which can be returned from cudnnFindConvolutionBackwardDataAlgorithm() and cudnnGetConvolutionBackwardDataAlgorithm_v7(). cudnnGetConvolutionBackwardDataAlgorithmMaxCount is the sum of all algorithms plus the sum of all algorithms with Tensor Core operations supported for the current device.", + "cudnnGetConvolutionBackwardDataAlgorithm_v7": "cudnnGetConvolutionBackwardDataAlgorithm_v7 serves as a heuristic for obtaining the best suited algorithm for cudnnConvolutionBackwardData() for the given layer specifications. cudnnGetConvolutionBackwardDataAlgorithm_v7 will return all algorithms (including CUDNN_TENSOR_OP_MATH and CUDNN_DEFAULT_MATH versions of algorithms where CUDNN_TENSOR_OP_MATH may be available) sorted by expected (based on internal heuristic) relative performance with the fastest being index 0 of perfResults. For an exhaustive search for the fastest algorithm, use cudnnFindConvolutionBackwardDataAlgorithm(). 
The total number of resulting algorithms can be queried through the returnedAlgoCount variable.", + "cudnnGetConvolutionBackwardDataWorkspaceSize": "cudnnGetConvolutionBackwardDataWorkspaceSize returns the amount of GPU memory workspace the user needs to allocate to be able to call cudnnConvolutionBackwardData() with the specified algorithm. The workspace allocated will then be passed to the routine cudnnConvolutionBackwardData(). The specified algorithm can be the result of the call to cudnnGetConvolutionBackwardDataAlgorithm_v7() or can be chosen arbitrarily by the user. Note that not every algorithm is available for every configuration of the input tensor and/or every configuration of the convolution descriptor.", + "cudnnGetConvolutionForwardAlgorithmMaxCount": "cudnnGetConvolutionForwardAlgorithmMaxCount returns the maximum number of algorithms which can be returned from cudnnFindConvolutionForwardAlgorithm() and cudnnGetConvolutionForwardAlgorithm_v7(). cudnnGetConvolutionForwardAlgorithmMaxCount is the sum of all algorithms plus the sum of all algorithms with Tensor Core operations supported for the current device.", + "cudnnGetConvolutionForwardAlgorithm_v7": "cudnnGetConvolutionForwardAlgorithm_v7 serves as a heuristic for obtaining the best suited algorithm for cudnnConvolutionForward() for the given layer specifications. cudnnGetConvolutionForwardAlgorithm_v7 will return all algorithms (including CUDNN_TENSOR_OP_MATH and CUDNN_DEFAULT_MATH versions of algorithms where CUDNN_TENSOR_OP_MATH may be available) sorted by expected (based on internal heuristic) relative performance with the fastest being index 0 of perfResults. For an exhaustive search for the fastest algorithm, use cudnnFindConvolutionForwardAlgorithm(). The total number of resulting algorithms can be queried through the returnedAlgoCount variable.", + "cudnnGetConvolutionForwardWorkspaceSize": "cudnnGetConvolutionForwardWorkspaceSize returns the amount of GPU memory workspace the user needs to allocate to be able to call cudnnConvolutionForward() with the specified algorithm. The workspace allocated will then be passed to the routine cudnnConvolutionForward(). The specified algorithm can be the result of the call to cudnnGetConvolutionForwardAlgorithm_v7() or can be chosen arbitrarily by the user. 
Note that not every algorithm is available for every configuration of the input tensor and/or every configuration of the convolution descriptor.", + "cudnnGetConvolutionGroupCount": "cudnnGetConvolutionGroupCount returns the group count specified in the given convolution descriptor.", + "cudnnGetConvolutionMathType": "cudnnGetConvolutionMathType returns the math type specified in a given convolution descriptor.", + "cudnnGetConvolutionNdDescriptor": "cudnnGetConvolutionNdDescriptor queries a previously initialized convolution descriptor object.", + "cudnnGetConvolutionNdForwardOutputDim": "cudnnGetConvolutionNdForwardOutputDim returns the dimensions of the resulting n-D tensor of an (nbDims-2)-D convolution, given the convolution descriptor, the input tensor descriptor and the filter descriptor. cudnnGetConvolutionNdForwardOutputDim can help to set up the output tensor and allocate the proper amount of memory prior to launching the actual convolution.", + "cudnnGetConvolutionReorderType": "cudnnGetConvolutionReorderType retrieves the convolution reorder type from the given convolution descriptor.", + "cudnnGetFoldedConvBackwardDataDescriptors": "cudnnGetFoldedConvBackwardDataDescriptors calculates folding descriptors for the backward data gradient. It takes as input the data descriptors along with the convolution descriptor and computes the folded data descriptors and the folding transform descriptors. These can then be used to do the actual folding transform.", + "cudnnIm2Col": "cudnnIm2Col constructs the A matrix necessary to perform a forward pass of GEMM convolution.", + "cudnnReorderFilterAndBias": "cudnnReorderFilterAndBias reorders the filter and bias values. It can be used to enhance the inference time by separating the reordering operation from convolution.", + "cudnnSetConvolution2dDescriptor": "cudnnSetConvolution2dDescriptor initializes a previously created convolution descriptor object into a 2D correlation. cudnnSetConvolution2dDescriptor assumes that the tensor and filter descriptors correspond to the forward convolution path and checks if their settings are valid. That same convolution descriptor can be reused in the backward path provided it corresponds to the same layer.", + "cudnnSetConvolutionGroupCount": "cudnnSetConvolutionGroupCount allows the user to specify the number of groups to be used in the associated convolution.", + "cudnnSetConvolutionMathType": "cudnnSetConvolutionMathType allows the user to specify whether or not the use of tensor op is permitted in the library routines associated with a given convolution descriptor.", + "cudnnSetConvolutionNdDescriptor": "cudnnSetConvolutionNdDescriptor initializes a previously created generic convolution descriptor object into an n-D correlation. That same convolution descriptor can be reused in the backward path provided it corresponds to the same layer. The convolution computation will be done in the specified dataType, which can be potentially different from the input/output tensors.", + "cudnnSetConvolutionReorderType": "cudnnSetConvolutionReorderType sets the convolution reorder type for the given convolution descriptor.", + "cudnnConvolutionBackwardBias": "cudnnConvolutionBackwardBias computes the convolution function gradient with respect to the bias, which is the sum of every element belonging to the same feature map across all of the images of the input tensor. Therefore, the number of elements produced is equal to the number of feature maps of the input tensor.", + "cudnnConvolutionBackwardFilter": "cudnnConvolutionBackwardFilter computes the convolution weight (filter) gradient of the tensor dy, where y is the output of the forward convolution in cudnnConvolutionForward(). It uses the specified algo, and returns the results in the output tensor dw. Scaling factors alpha and beta can be used to scale the computed result or accumulate with the current dw.", + "cudnnCreateFusedOpsConstParamPack": "cudnnCreateFusedOpsConstParamPack creates an opaque structure to store the various problem size information, such as the shape, layout and the type of tensors, and the descriptors for convolution and activation, for the selected sequence of cudnnFusedOps computations.", + "cudnnCreateFusedOpsPlan": "cudnnCreateFusedOpsPlan creates the plan descriptor for the cudnnFusedOps computation. cudnnCreateFusedOpsPlan descriptor contains the plan information, including the problem type and size, which kernels should be run, and the internal workspace partition.", + "cudnnCreateFusedOpsVariantParamPack": "cudnnCreateFusedOpsVariantParamPack creates a descriptor for cudnnFusedOps variant parameters.", + "cudnnDestroyFusedOpsConstParamPack": "cudnnDestroyFusedOpsConstParamPack destroys a previously-created cudnnFusedOpsConstParamPack_t structure.", + "cudnnDestroyFusedOpsPlan": "cudnnDestroyFusedOpsPlan destroys the plan descriptor provided.", + "cudnnDestroyFusedOpsVariantParamPack": "cudnnDestroyFusedOpsVariantParamPack destroys a previously-created descriptor for cudnnFusedOps variant parameters.", + "cudnnFindConvolutionBackwardFilterAlgorithm": "cudnnFindConvolutionBackwardFilterAlgorithm attempts all algorithms available for cudnnConvolutionBackwardFilter(). It will attempt both the provided convDesc mathType and CUDNN_DEFAULT_MATH (assuming the two differ).", + "cudnnFindConvolutionBackwardFilterAlgorithmEx": "cudnnFindConvolutionBackwardFilterAlgorithmEx attempts all algorithms available for cudnnConvolutionBackwardFilter(). It will attempt both the provided convDesc mathType and CUDNN_DEFAULT_MATH (assuming the two differ).", + "cudnnFusedOpsExecute": "cudnnFusedOpsExecute executes the sequence of cudnnFusedOps operations.", + "cudnnGetConvolutionBackwardFilterAlgorithmMaxCount": "cudnnGetConvolutionBackwardFilterAlgorithmMaxCount returns the maximum number of algorithms which can be returned from cudnnFindConvolutionBackwardFilterAlgorithm() and cudnnGetConvolutionBackwardFilterAlgorithm_v7(). cudnnGetConvolutionBackwardFilterAlgorithmMaxCount is the sum of all algorithms plus the sum of all algorithms with Tensor Core operations supported for the current device.", + "cudnnGetConvolutionBackwardFilterAlgorithm_v7": "cudnnGetConvolutionBackwardFilterAlgorithm_v7 serves as a heuristic for obtaining the best suited algorithm for cudnnConvolutionBackwardFilter() for the given layer specifications. cudnnGetConvolutionBackwardFilterAlgorithm_v7 will return all algorithms (including CUDNN_TENSOR_OP_MATH and CUDNN_DEFAULT_MATH versions of algorithms where CUDNN_TENSOR_OP_MATH may be available) sorted by expected (based on internal heuristic) relative performance with the fastest being index 0 of perfResults. For an exhaustive search for the fastest algorithm, use cudnnFindConvolutionBackwardFilterAlgorithm(). 
The total number of resulting algorithms can be queried through the returnedAlgoCount variable.", + "cudnnGetConvolutionBackwardFilterWorkspaceSize": "cudnnGetConvolutionBackwardFilterWorkspaceSize returns the amount of GPU memory workspace the user needs to allocate to be able to call cudnnConvolutionBackwardFilter() with the specified algorithm. The workspace allocated will then be passed to the routine cudnnConvolutionBackwardFilter(). The specified algorithm can be the result of the call to cudnnGetConvolutionBackwardFilterAlgorithm_v7() or can be chosen arbitrarily by the user. Note that not every algorithm is available for every configuration of the input tensor and/or every configuration of the convolution descriptor.", + "cudnnGetFusedOpsConstParamPackAttribute": "cudnnGetFusedOpsConstParamPackAttribute retrieves the values of the descriptor pointed to by the param pointer input. The type of the descriptor is indicated by the enum value of paramLabel input.", + "cudnnGetFusedOpsVariantParamPackAttribute": "cudnnGetFusedOpsVariantParamPackAttribute retrieves the settings of the variable parameter pack descriptor.", + "cudnnMakeFusedOpsPlan": "cudnnMakeFusedOpsPlan determines the optimum kernel to execute, and the workspace size the user should allocate, prior to the actual execution of the fused operations by cudnnFusedOpsExecute().", + "cudnnSetFusedOpsConstParamPackAttribute": "cudnnSetFusedOpsConstParamPackAttribute sets the descriptor pointed to by the param pointer input. The type of the descriptor to be set is indicated by the enum value of the paramLabel input.", + "cudnnSetFusedOpsVariantParamPackAttribute": "cudnnSetFusedOpsVariantParamPackAttribute sets the variable parameter pack descriptor.", + "cudnnAdvInferVersionCheck": "cudnnAdvInferVersionCheck checks to see whether the version of the AdvInfer subset of the library is consistent with the other sub-libraries.", + "cudnnBuildRNNDynamic": "cudnnBuildRNNDynamic compiles the RNN persistent code using CUDA runtime compilation library (NVRTC) when the CUDNN_RNN_ALGO_PERSIST_DYNAMIC algo is selected. The code is tailored to the current GPU and specific hyperparameters (miniBatch). cudnnBuildRNNDynamic call is expected to be expensive in terms of runtime and should be invoked infrequently. Note that the CUDNN_RNN_ALGO_PERSIST_DYNAMIC algo does not support variable length sequences within the batch.", + "cudnnCreateAttnDescriptor": "cudnnCreateAttnDescriptor creates one instance of an opaque attention descriptor object by allocating the host memory for it and initializing all descriptor fields. The function writes NULL to attnDesc when the attention descriptor object cannot be allocated.", + "cudnnCreatePersistentRNNPlan": "cudnnCreatePersistentRNNPlan has been deprecated in cuDNN 8.0. Use cudnnBuildRNNDynamic() instead of cudnnCreatePersistentRNNPlan().", + "cudnnCreateRNNDataDescriptor": "cudnnCreateRNNDataDescriptor creates a RNN data descriptor object by allocating the memory needed to hold its opaque structure.", + "cudnnCreateRNNDescriptor": "cudnnCreateRNNDescriptor creates a generic RNN descriptor object by allocating the memory needed to hold its opaque structure.", + "cudnnCreateSeqDataDescriptor": "cudnnCreateSeqDataDescriptor creates one instance of an opaque sequence data descriptor object by allocating the host memory for it and initializing all descriptor fields. 
The function writes NULL to seqDataDesc when the sequence data descriptor object cannot be allocated.", + "cudnnDestroyAttnDescriptor": "cudnnDestroyAttnDescriptor destroys the attention descriptor object and releases its memory. The attnDesc argument can be NULL. Invoking cudnnDestroyAttnDescriptor() with a NULL argument is a no operation (NOP).", + "cudnnDestroyPersistentRNNPlan": "cudnnDestroyPersistentRNNPlan destroys a previously created persistent RNN plan object.", + "cudnnDestroyRNNDataDescriptor": "cudnnDestroyRNNDataDescriptor destroys a previously created RNN data descriptor object.", + "cudnnDestroyRNNDescriptor": "cudnnDestroyRNNDescriptor destroys a previously created RNN descriptor object.", + "cudnnDestroySeqDataDescriptor": "cudnnDestroySeqDataDescriptor destroys the sequence data descriptor object and releases its memory. The seqDataDesc argument can be NULL. Invoking cudnnDestroySeqDataDescriptor() with a NULL argument is a no operation (NOP).", + "cudnnFindRNNForwardInferenceAlgorithmEx": "cudnnFindRNNForwardInferenceAlgorithmEx attempts all available cuDNN algorithms for cudnnRNNForwardInference(), using user-allocated GPU memory. It outputs the parameters that influence the performance of the algorithm to a user-allocated array of cudnnAlgorithmPerformance_t. These parameter metrics are written in sorted fashion where the first element has the lowest compute time.", + "cudnnGetAttnDescriptor": "cudnnGetAttnDescriptor retrieves settings from the previously created attention descriptor. The user can assign NULL to any pointer except attnDesc when the retrieved value is not needed.", + "cudnnGetMultiHeadAttnBuffers": "The user must allocate weight, work, and reserve space buffer sizes in the GPU memory using cudaMalloc() with the reported buffer sizes. The buffers can also be carved out from a larger chunk of allocated memory but the buffer addresses must be at least 16B aligned.", + "cudnnGetMultiHeadAttnWeights": "cudnnGetMultiHeadAttnWeights obtains the shape of the weight or bias tensor. It also retrieves the start address of tensor data located in the weight buffer. Use the wKind argument to select a particular tensor. For more information, see cudnnMultiHeadAttnWeightKind_t for the description of the enumerant type.", + "cudnnGetRNNBiasMode": "cudnnGetRNNBiasMode has been deprecated in cuDNN 8.0. Use cudnnGetRNNDescriptor_v8() instead of cudnnGetRNNBiasMode().", + "cudnnGetRNNDataDescriptor": "cudnnGetRNNDataDescriptor retrieves a previously created RNN data descriptor object.", + "cudnnGetRNNDescriptor_v6": "cudnnGetRNNDescriptor_v6 has been deprecated in cuDNN 8.0. Use cudnnGetRNNDescriptor_v8() instead of cudnnGetRNNDescriptor_v6().", + "cudnnGetRNNDescriptor_v8": "cudnnGetRNNDescriptor_v8 retrieves RNN network parameters that were configured by cudnnSetRNNDescriptor_v8(). The user can assign NULL to any pointer except rnnDesc when the retrieved value is not needed. The function does not check the validity of retrieved parameters.", + "cudnnGetRNNLinLayerBiasParams": "cudnnGetRNNLinLayerBiasParams has been deprecated in cuDNN 8.0. Use cudnnGetRNNWeightParams() instead of cudnnGetRNNLinLayerBiasParams().", + "cudnnGetRNNLinLayerMatrixParams": "cudnnGetRNNLinLayerMatrixParams has been deprecated in cuDNN 8.0. Use cudnnGetRNNWeightParams() instead of cudnnGetRNNLinLayerMatrixParams().", + "cudnnGetRNNMatrixMathType": "cudnnGetRNNMatrixMathType has been deprecated in cuDNN 8.0. Use cudnnGetRNNDescriptor_v8() instead of cudnnGetRNNMatrixMathType().",
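The 16-byte alignment requirement in the cudnnGetMultiHeadAttnBuffers() entry above only needs care when the weight, work, and reserve buffers are carved out of one larger allocation instead of three separate cudaMalloc() calls. A small self-contained Go sketch of the offset arithmetic (illustrative; it assumes the base allocation is itself at least 16B aligned, which cudaMalloc() guarantees):

	// alignUp rounds n up to the next multiple of align; align must be a power of two.
	func alignUp(n, align uintptr) uintptr {
		return (n + align - 1) &^ (align - 1)
	}

	// carve computes 16B-aligned start offsets for the weight, work, and
	// reserve sub-buffers within a single device allocation.
	func carve(weightSize, workSize uintptr) (weightOff, workOff, reserveOff uintptr) {
		weightOff = 0
		workOff = alignUp(weightOff+weightSize, 16)
		reserveOff = alignUp(workOff+workSize, 16)
		return weightOff, workOff, reserveOff
	}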
+ "cudnnGetRNNPaddingMode": "cudnnGetRNNPaddingMode has been deprecated in cuDNN 8.0. Use cudnnGetRNNDescriptor_v8() instead of cudnnGetRNNPaddingMode().", + "cudnnGetRNNParamsSize": "cudnnGetRNNParamsSize has been deprecated in cuDNN 8.0. Use cudnnGetRNNWeightSpaceSize() instead of cudnnGetRNNParamsSize().", + "cudnnGetRNNProjectionLayers": "cudnnGetRNNProjectionLayers has been deprecated in cuDNN 8.0. Use cudnnGetRNNDescriptor_v8() instead of cudnnGetRNNProjectionLayers().", + "cudnnGetRNNTempSpaceSizes": "cudnnGetRNNTempSpaceSizes computes the work and reserve space buffer sizes based on the RNN network geometry stored in rnnDesc, designated usage (inference or training) defined by the fMode argument, and the current RNN data dimensions (maxSeqLength, batchSize) retrieved from xDesc. When RNN data dimensions change, cudnnGetRNNTempSpaceSizes() must be called again because RNN temporary buffer sizes are not monotonic.", + "cudnnGetRNNWeightParams": "cudnnGetRNNWeightParams is used to obtain the start address and shape of every RNN weight matrix and bias vector in each pseudo-layer within the recurrent network.", + "cudnnGetRNNWeightSpaceSize": "cudnnGetRNNWeightSpaceSize reports the required size of the weight space buffer in bytes. The weight space buffer holds all RNN weight matrices and bias vectors.", + "cudnnGetRNNWorkspaceSize": "cudnnGetRNNWorkspaceSize has been deprecated in cuDNN 8.0. Use cudnnGetRNNTempSpaceSizes() instead of cudnnGetRNNWorkspaceSize().", + "cudnnGetSeqDataDescriptor": "cudnnGetSeqDataDescriptor retrieves settings from a previously created sequence data descriptor. The user can assign NULL to any pointer except seqDataDesc when the retrieved value is not needed. The nbDimsRequested argument applies to both dimA[] and axes[] arrays. A positive value of nbDimsRequested or seqLengthSizeRequested is ignored when the corresponding array, dimA[], axes[], or seqLengthArray[] is NULL.", + "cudnnMultiHeadAttnForward": "The cudnnMultiHeadAttnForward() function computes the forward responses of the multi-head attention layer. When reserveSpaceSizeInBytes=0 and reserveSpace=NULL, the function operates in the inference mode in which backward (gradient) functions are not invoked; otherwise, the training mode is assumed. In the training mode, the reserve space is used to pass intermediate results from cudnnMultiHeadAttnForward() to cudnnMultiHeadAttnBackwardData() and from cudnnMultiHeadAttnBackwardData() to cudnnMultiHeadAttnBackwardWeights().", + "cudnnRNNForward": "cudnnRNNForward computes the forward response of the recurrent neural network described by rnnDesc with inputs in x, hx, cx, and weights/biases in the weightSpace buffer. RNN outputs are written to y, hy, and cy buffers. Locations of x, y, hx, cx, hy, and cy signals in the multi-layer RNN model are shown in the Figure below. Note that internal RNN signals between time-steps and between layers are not exposed to the user.", + "cudnnRNNForwardInference": "cudnnRNNForwardInference has been deprecated in cuDNN 8.0. Use cudnnRNNForward() instead of cudnnRNNForwardInference().", + "cudnnRNNForwardInferenceEx": "cudnnRNNForwardInferenceEx has been deprecated in cuDNN 8.0. Use cudnnRNNForward() instead of cudnnRNNForwardInferenceEx().", + "cudnnRNNGetClip": "cudnnRNNGetClip has been deprecated in cuDNN 8.0. 
Use cudnnRNNGetClip_v8() instead of cudnnRNNGetClip().", + "cudnnRNNGetClip_v8": "cudnnRNNGetClip_v8 retrieves the current LSTM cell clipping parameters, and stores them in the arguments provided. The user can assign NULL to any pointer except rnnDesc when the retrieved value is not needed. The function does not check the validity of retrieved parameters.", + "cudnnRNNSetClip": "cudnnRNNSetClip has been deprecated in cuDNN 8.0. Use cudnnRNNSetClip_v8() instead of cudnnRNNSetClip().", + "cudnnRNNSetClip_v8": "cudnnRNNSetClip_v8 sets the LSTM cell clipping mode. The LSTM clipping is disabled by default. When enabled, clipping is applied to all layers. cudnnRNNSetClip_v8 does not affect the work, reserve, and weight-space buffer sizes and may be called multiple times.", + "cudnnSetAttnDescriptor": "cudnnSetAttnDescriptor configures a multi-head attention descriptor that was previously created using the cudnnCreateAttnDescriptor() function. The function sets attention parameters that are necessary to compute internal buffer sizes, dimensions of weight and bias tensors, or to select optimized code paths.", + "cudnnSetPersistentRNNPlan": "cudnnSetPersistentRNNPlan sets the persistent RNN plan to be executed when using rnnDesc and CUDNN_RNN_ALGO_PERSIST_DYNAMIC algo.", + "cudnnSetRNNBiasMode": "cudnnSetRNNBiasMode has been deprecated in cuDNN 8.0. Use cudnnSetRNNDescriptor_v8() instead of cudnnSetRNNBiasMode().", + "cudnnSetRNNDataDescriptor": "cudnnSetRNNDataDescriptor initializes a previously created RNN data descriptor object. cudnnSetRNNDataDescriptor data structure is intended to support the unpacked (padded) layout for input and output of extended RNN inference and training functions. A packed (unpadded) layout is also supported for backward compatibility.", + "cudnnSetRNNDescriptor_v6": "cudnnSetRNNDescriptor_v6 has been deprecated in cuDNN 8.0. Use cudnnSetRNNDescriptor_v8() instead of cudnnSetRNNDescriptor_v6().", + "cudnnSetRNNDescriptor_v8": "cudnnSetRNNDescriptor_v8 initializes a previously created RNN descriptor object. The RNN descriptor configured by cudnnSetRNNDescriptor_v8() was enhanced to store all information needed to compute the total number of adjustable weights/biases in the RNN model.", + "cudnnSetRNNMatrixMathType": "cudnnSetRNNMatrixMathType has been deprecated in cuDNN 8.0. Use cudnnSetRNNDescriptor_v8() instead of cudnnSetRNNMatrixMathType().", + "cudnnSetRNNPaddingMode": "cudnnSetRNNPaddingMode has been deprecated in cuDNN 8.0. Use cudnnSetRNNDescriptor_v8() instead of cudnnSetRNNPaddingMode().", + "cudnnSetRNNProjectionLayers": "cudnnSetRNNProjectionLayers has been deprecated in cuDNN 8.0. Use cudnnSetRNNDescriptor_v8() instead of cudnnSetRNNProjectionLayers().", + "cudnnSetSeqDataDescriptor": "For example, to express information that vectors in our sequence data buffer are five elements long, we need to assign dimA[CUDNN_SEQDATA_VECT_DIM]=5 in the dimA[] array.", + "cudnnAdvTrainVersionCheck": "cudnnAdvTrainVersionCheck checks whether the version of the AdvTrain subset of the library is consistent with the other sub-libraries.", + "cudnnCreateCTCLossDescriptor": "cudnnCreateCTCLossDescriptor creates a CTC loss function descriptor.", + "cudnnCTCLoss": "cudnnCTCLoss returns the CTC costs and gradients, given the probabilities and labels.", + "cudnnCTCLoss_v8": "cudnnCTCLoss_v8 returns the CTC costs and gradients, given the probabilities and labels.", + "cudnnDestroyCTCLossDescriptor": "cudnnDestroyCTCLossDescriptor destroys a CTC loss function descriptor object.", + "cudnnFindRNNBackwardDataAlgorithmEx": "cudnnFindRNNBackwardDataAlgorithmEx attempts all available cuDNN algorithms for cudnnRNNBackwardData(), using user-allocated GPU memory. It outputs the parameters that influence the performance of the algorithm to a user-allocated array of cudnnAlgorithmPerformance_t. These parameter metrics are written in sorted fashion where the first element has the lowest compute time.", + "cudnnFindRNNBackwardWeightsAlgorithmEx": "cudnnFindRNNBackwardWeightsAlgorithmEx attempts all available cuDNN algorithms for cudnnRNNBackwardWeights(), using user-allocated GPU memory. It outputs the parameters that influence the performance of the algorithm to a user-allocated array of cudnnAlgorithmPerformance_t. These parameter metrics are written in sorted fashion where the first element has the lowest compute time.", + "cudnnFindRNNForwardTrainingAlgorithmEx": "cudnnFindRNNForwardTrainingAlgorithmEx attempts all available cuDNN algorithms for cudnnRNNForwardTraining(), using user-allocated GPU memory. It outputs the parameters that influence the performance of the algorithm to a user-allocated array of cudnnAlgorithmPerformance_t. These parameter metrics are written in sorted fashion where the first element has the lowest compute time.", + "cudnnGetCTCLossDescriptor": "cudnnGetCTCLossDescriptor returns the configuration of the passed CTC loss function descriptor.", + "cudnnGetCTCLossDescriptorEx": "cudnnGetCTCLossDescriptorEx returns the configuration of the passed CTC loss function descriptor.", + "cudnnGetCTCLossDescriptor_v8": "cudnnGetCTCLossDescriptor_v8 returns the configuration of the passed CTC loss function descriptor.", + "cudnnGetCTCLossWorkspaceSize": "cudnnGetCTCLossWorkspaceSize returns the amount of GPU memory workspace the user needs to allocate to be able to call cudnnCTCLoss() with the specified algorithm. The workspace allocated will then be passed to the routine cudnnCTCLoss().", + "cudnnGetCTCLossWorkspaceSize_v8": "cudnnGetCTCLossWorkspaceSize_v8 returns the amount of GPU memory workspace the user needs to allocate to be able to call cudnnCTCLoss_v8() with the specified algorithm. The workspace allocated will then be passed to the routine cudnnCTCLoss_v8().", + "cudnnGetRNNTrainingReserveSize": "cudnnGetRNNTrainingReserveSize has been deprecated in cuDNN 8.0. Use cudnnGetRNNTempSpaceSizes() instead of cudnnGetRNNTrainingReserveSize().", + "cudnnMultiHeadAttnBackwardData": "cudnnMultiHeadAttnBackwardData computes exact, first-order derivatives of the multi-head attention block with respect to its inputs: Q, K, V. If y=F(x) is a vector-valued function that represents the multi-head attention layer and it takes some vector x ∈ ℝⁿ as an input (with all other parameters and inputs constant), and outputs vector y ∈ ℝᵐ, then cudnnMultiHeadAttnBackwardData() computes the result of (∂yᵢ/∂xⱼ)ᵀ δout where δout is the m×1 gradient of the loss function with respect to multi-head attention outputs. The δout gradient is backpropagated through prior layers of the deep learning model. ∂yᵢ/∂xⱼ is the m×n Jacobian matrix of F(x). The input is supplied via the dout argument and gradient results for Q, K, V are written to the dqueries, dkeys, and dvalues buffers.", + "cudnnMultiHeadAttnBackwardWeights": "cudnnMultiHeadAttnBackwardWeights computes exact, first-order derivatives of the multi-head attention block with respect to its trainable parameters: projection weights and projection biases. If y=F(w) is a vector-valued function that represents the multi-head attention layer and it takes some vector w ∈ ℝⁿ of flattened weights or biases as an input (with all other parameters and inputs fixed), and outputs vector y ∈ ℝᵐ, then cudnnMultiHeadAttnBackwardWeights() computes the result of (∂yᵢ/∂wⱼ)ᵀ δout where δout is the m×1 gradient of the loss function with respect to multi-head attention outputs. The δout gradient is backpropagated through prior layers of the deep learning model. ∂yᵢ/∂wⱼ is the m×n Jacobian matrix of F(w). The δout input is supplied via the dout argument.", + "cudnnRNNBackwardData": "cudnnRNNBackwardData has been deprecated in cuDNN 8.0. Use cudnnRNNBackwardData_v8() instead of cudnnRNNBackwardData().", + "cudnnRNNBackwardData_v8": "cudnnRNNBackwardData_v8 computes exact, first-order derivatives of the RNN model with respect to its inputs: x, hx and, for the LSTM cell type, also cx. If o = [y, hy, cy] = F(x, hx, cx) = F(z) is a vector-valued function that represents the entire RNN model and it takes vectors x (for all time-steps) and vectors hx, cx (for all layers) as inputs, concatenated into z ∈ ℝⁿ (network weights and biases are assumed constant), and outputs vectors y, hy, cy concatenated into a vector o ∈ ℝᵐ, then cudnnRNNBackwardData_v8() computes the result of (∂oᵢ/∂zⱼ)ᵀ δout where δout is the m×1 gradient of the loss function with respect to all RNN outputs. The δout gradient is backpropagated through prior layers of the deep learning model, starting from the model output. ∂oᵢ/∂zⱼ is the m×n Jacobian matrix of F(z). The δout input is supplied via the dy, dhy, and dcy arguments and gradient results (∂oᵢ/∂zⱼ)ᵀ δout are written to the dx, dhx, and dcx buffers.", + "cudnnRNNBackwardDataEx": "cudnnRNNBackwardDataEx has been deprecated in cuDNN 8.0. Use cudnnRNNBackwardData_v8() instead of cudnnRNNBackwardDataEx().", + "cudnnRNNBackwardWeights": "cudnnRNNBackwardWeights has been deprecated in cuDNN 8.0. Use cudnnRNNBackwardWeights_v8() instead of cudnnRNNBackwardWeights().", + "cudnnRNNBackwardWeights_v8": "cudnnRNNBackwardWeights_v8 computes exact, first-order derivatives of the RNN model with respect to all trainable parameters: weights and biases. If o = [y, hy, cy] = F(w) is a vector-valued function that represents the multi-layer RNN model and it takes some vector w ∈ ℝⁿ of flattened weights or biases as input (with all other data inputs constant), and outputs vector o ∈ ℝᵐ, then cudnnRNNBackwardWeights_v8() computes the result of (∂oᵢ/∂wⱼ)ᵀ δout where δout is the m×1 gradient of the loss function with respect to all RNN outputs. The δout gradient is backpropagated through prior layers of the deep learning model, starting from the model output. ∂oᵢ/∂wⱼ is the m×n Jacobian matrix of F(w). The δout input is supplied via the dy, dhy, and dcy arguments in the cudnnRNNBackwardData_v8() function.", + "cudnnRNNBackwardWeightsEx": "cudnnRNNBackwardWeightsEx has been deprecated in cuDNN 8.0. Use cudnnRNNBackwardWeights_v8() instead of cudnnRNNBackwardWeightsEx().",
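Restated without the notation noise, the backward entries above (cudnnMultiHeadAttnBackwardData, cudnnMultiHeadAttnBackwardWeights, cudnnRNNBackwardData_v8, cudnnRNNBackwardWeights_v8) all describe the same computation, a vector-Jacobian product; in LaTeX, for y = F(x) with x ∈ ℝⁿ and y ∈ ℝᵐ:

	\left(\frac{\partial y}{\partial x}\right)^{T} \delta_{\text{out}}, \qquad \delta_{\text{out}} \in \mathbb{R}^{m \times 1}

The m×n Jacobian is never materialized; each function writes the n×1 product straight into the corresponding gradient buffers (dx, dhx, dcx for the RNN data gradients; dqueries, dkeys, dvalues for the attention data gradients; the weight-gradient buffers otherwise).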
+ "cudnnRNNForwardTraining": "cudnnRNNForwardTraining has been deprecated in cuDNN 8.0. Use cudnnRNNForward() instead of cudnnRNNForwardTraining().", + "cudnnRNNForwardTrainingEx": "cudnnRNNForwardTrainingEx has been deprecated starting in cuDNN 8.0. Use cudnnRNNForward() instead of cudnnRNNForwardTrainingEx().", + "cudnnSetCTCLossDescriptor": "cudnnSetCTCLossDescriptor sets a CTC loss function descriptor. See also the extended version cudnnSetCTCLossDescriptorEx() to set the input normalization mode.", + "cudnnSetCTCLossDescriptorEx": "cudnnSetCTCLossDescriptorEx is an extension of cudnnSetCTCLossDescriptor(). cudnnSetCTCLossDescriptorEx provides an additional interface normMode to set the input normalization mode for the CTC loss function, and gradMode to control the NaN propagation type.", + "cudnnSetCTCLossDescriptor_v8": "Many CTC API functions are updated in cuDNN version 8.0.0 to support CUDA graphs. In order to do so, a new parameter is needed, maxLabelLength. Now that label and input data are assumed to be in GPU memory, this information is not otherwise readily available.", } diff --git a/cmd/gencudnn/params.go b/cmd/gencudnn/params.go index 875082f..2b4b7be 100644 --- a/cmd/gencudnn/params.go +++ b/cmd/gencudnn/params.go @@ -1,8 +1,8 @@ package main import ( - "github.com/cznic/cc" bg "github.com/gorgonia/bindgen" + "modernc.org/cc" ) func isInput(fnName string, p bg.Parameter) bool { diff --git a/cmd/gencudnn/parse.go b/cmd/gencudnn/parse.go index 9ca2107..c978bba 100644 --- a/cmd/gencudnn/parse.go +++ b/cmd/gencudnn/parse.go @@ -4,8 +4,8 @@ import ( "fmt" "strings" - "github.com/cznic/cc" "github.com/gorgonia/bindgen" + "modernc.org/cc" ) // Functions returns the C function declarations in the given set of file paths. @@ -113,6 +113,18 @@ func processEnumName(lcp, name string) string { return "PrecomputedMeans" case "CUDNN_SAMPLER_BILINEAR": return "Bilinear" + case "CUDNN_PTR_16B_ALIGNED": // processing would yield `16B_Aligned`, which is not a valid Go name + return "Ptr16" + case "CUDNN_PTR_NULL": + return "NullPtr" + case "CUDNN_PTR_ELEM_ALIGNED": + return "PtrElemAligned" + case "CUDNN_BATCHNORM_OPS_BN": + return "BatchNorm" // name == lcp otherwise + case "CUDNN_GENSTATS_SUM_SQSUM": + return "SumSq" // it is the only enum in the list, so name == lcp + case "CUDNN_NORM_OPS_NORM": + return "Norm" // name == lcp otherwise } var trimmed string @@ -123,6 +135,34 @@ func processEnumName(lcp, name string) string { lowered := strings.ToLower(trimmed) switch lcp { + case "CUDNN_RNN_CLIP_": + lowered = "RNNClip" + strings.Title(lowered) + case "CUDNN_RNN_": + lowered = "RNN" + strings.Title(lowered) + case "CUDNN_RNN_ALGO_": + lowered = strings.Title(lowered) + "RNN" + case "CUDNN_POINTWISE_": + lowered = "Pointwise" + strings.Title(lowered) + case "CUDNN_OP_TENSOR_": + lowered = "Tensor" + strings.Title(lowered) + case "CUDNN_NORM_OPS_NORM": + lowered = "Norm" + strings.Title(lowered) + case "CUDNN_NORM_PER_": + lowered = "NormPer" + strings.Title(lowered) + case "CUDNN_NORM_ALGO_": + lowered = strings.Title(lowered) + "Norm" + case "CUDNN_LOSS_NORMALIZATION_": + lowered = "LossNorm" + strings.Title(lowered) + case "CUDNN_BATCHNORM_OPS_BN": + lowered = "BatchNorm" + strings.Title(lowered) + case "CUDNN_LAYOUT_TYPE_": + lowered = "BELayout" + strings.Title(lowered) + case "CUDNN_BACKEND_": + lowered = "BEDescriptor" + strings.Title(lowered) + case "CUDNN_ATTR_": + lowered = "BEAttrName" + strings.Title(lowered) + case "CUDNN_TYPE_": + lowered = "BEAttr" + strings.Title(lowered) case "CUDNN_TENSOR_N": // tensor description lowered = "n" + lowered @@ -134,6 +174,7 @@ func processEnumName(lcp, name string) string { case "CUDNN_CTC_LOSS_ALGO_": // CTC Loss Algorithms lowered = lowered + "CTCLoss" + case "CUDNN_PTR_": default: } @@ -231,7 +272,7 @@ func toC(name, typ string) string { } if typ == "Memory" { - return fmt.Sprintf("%v.Pointer()", name) + return fmt.Sprintf("unsafe.Pointer(%v.Uintptr())", name) } // log.Printf("name %q typ %q", name, typ) @@ -250,12 +291,14 @@ func toCType(goType string) string { } func getRetVal(cs *bindgen.CSignature) map[int]string { + name := cs.Name outputs := outputParams[name] ios := ioParams[name] if len(outputs)+len(ios) == 0 { return nil } + retVal := make(map[int]string) for i, p := range cs.Parameters() { param := p.Name() @@ -269,9 +312,11 @@ func getRetVal(cs *bindgen.CSignature) map[int]string { func getRetValOnly(cs *bindgen.CSignature) map[int]string { name := cs.Name outputs := outputParams[name] + if len(outputs) == 0 { return nil } + retVal := make(map[int]string) for i, p := range cs.Parameters() { param := p.Name()
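The toC hunk above changes how Memory-typed arguments are lowered to C: instead of calling v.Pointer(), the generated code now rebuilds the pointer from v.Uintptr(). A minimal sketch of the pattern the generated call sites rely on (the mem type is invented for illustration):

```go
package main

import (
	"fmt"
	"unsafe"
)

// mem stands in for a Memory value; only Uintptr() is assumed, matching
// what toC now emits: unsafe.Pointer(v.Uintptr()).
type mem []float64

func (m mem) Uintptr() uintptr { return uintptr(unsafe.Pointer(&m[0])) }

func main() {
	m := mem{1, 2, 3}
	// The uintptr -> unsafe.Pointer round trip is only safe while m's
	// backing array is kept alive and is not moved.
	p := unsafe.Pointer(m.Uintptr())
	fmt.Println(*(*float64)(p)) // 1
}
```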
diff --git a/cmd/gencudnn/parse.py b/cmd/gencudnn/parse.py index 08d3b2a..6390139 100644 --- a/cmd/gencudnn/parse.py +++ b/cmd/gencudnn/parse.py @@ -14,7 +14,7 @@ def get(): with open("cache/docs.html", 'r') as f: print("Using cache", file=sys.stderr) return f.read() - r = requests.get("http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html") + r = requests.get("https://docs.nvidia.com/deeplearning/cudnn/api/index.html") with open("cache/docs.html", 'w') as f: f.write(r.text) return r.text @@ -22,10 +22,15 @@ def get(): def main(): txt = get() soup = BeautifulSoup(txt, "html5lib") - contents = soup.find_all(id="api-introduction") - topics = contents[0].find_all(class_="topic concept nested1") - for topic in topics: + contents = soup.find_all(id="contents") + ids = ["cudnn-ops-infer-so-library", "cudnn-ops-train-so-library", "cudnn-cnn-infer-so-library", "cudnn-cnn-train-so-library", "cudnn-adv-infer-so-library", "cudnn-adv-train-so-library", "cudnn-backend-api"] + topics = [contents[0].find_all(id=i)[0].find_all(id=re.compile("-api")) for i in ids] + topics = [t.find_all(class_="topic concept nested2") for topic in topics for t in topic] + topics = [t for ts in topics for t in ts] + #print(topics[0]) + for i, topic in enumerate(topics): rawFnName = topic.find_all(class_='title topictitle2')[0].text + rawFnName = rawFnName.rstrip('()') try: fnName = re.search('cudnn.+$', rawFnName).group(0) except AttributeError as e: @@ -36,13 +41,13 @@ def main(): except IndexError: print("rawFnName: {} - topic has no dl class".format(fnName), file=sys.stderr) continue - + #print(paramsDL) # check previous - if paramsDL.previous_sibling.previous_sibling.text != "Parameters": - print("rawFnName: {} has no params::: {}".format(fnName, paramsDL.previous_sibling), file=sys.stderr) - continue + #if paramsDL.previous_sibling.previous_sibling.text != "Parameters": + # print("rawFnName: {} has no params::: {}".format(fnName, paramsDL.previous_sibling), file=sys.stderr) + # continue - params = paramsDL.find_all(class_='dt dlterm') # name + params = paramsDL.find_all(class_='dt dlterm') # name paramsDesc = paramsDL.find_all(class_='dd') # use type paramUse = [] for d in paramsDesc: @@ -71,8 +76,12 @@ def main(): except IndexError: print("fnName: {} - no body".format(fnName), file=sys.stderr) continue - # clear is better than clever. - doc = docbody.find_all("p")[0].text + # clear is better than clever. + try: + doc = docbody.find_all("p")[0].text + except IndexError: + print("fnName: {} - no p".format(fnName), file=sys.stderr) + continue doc = doc.replace("\n", "") doc = re.sub("\t+", " ", doc) doc = re.sub("\s+", " ", doc) @@ -85,6 +94,7 @@ def main(): # write the go file print("package main") + print("/* generated by parse.py. DO NOT EDIT */") print("var inputParams = map[string][]string{") for k, v in inputs.items(): if len(v) == 0: continue @@ -123,4 +133,4 @@ def main(): print('"{}": "{}",'.format(k, v.strip())) print("}") -main() \ No newline at end of file +main()
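For orientation, those print() calls emit a Go file of roughly this shape (the entry below is invented for illustration; the real values come from the scraped docs):

```go
package main

/* generated by parse.py. DO NOT EDIT */

// One map per parameter role; parse.py prints these back to back.
var inputParams = map[string][]string{
	"cudnnSetCTCLossDescriptor": {"compType"},
}
```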
diff --git a/cmd/genlib/README.md b/cmd/genlib/README.md index 52544fe..ca0d120 100644 --- a/cmd/genlib/README.md +++ b/cmd/genlib/README.md @@ -1,18 +1,42 @@ -# genlibcu # +# genlib # -genlibcu is the program that generates the package `cu`. It does so by parsing `cuda.h` which is a modified form of `cuda.h` that comes with a CUDA installation. Specifially these were the modifications made: +genlib is the program that generates the package `cu`. It does so by parsing `cuda.h`, which is a modified form of the `cuda.h` that comes with a CUDA installation. Specifically these were the modifications made: ``` -gcc -E -P cuda.h > cuda.h -astyle --style=google --lineend=linux --indent=tab --indent-switches --align-pointer=type --align-reference=name --delete-empty-lines cuda.h -sed -i 's/_v2//g' cuda.h -sed -i 's/_v3//g' cuda.h +cp /usr/local/cuda/include/cuda.h cuda2.h // copy cuda.h to current dir +echo '#include "cuda2.h"' > cuda.c // make a fake C file +gcc -E -P cuda.c > cuda.h // ask GCC to perform preprocessing. +astyle --style=google --lineend=linux --indent=tab --indent-switches --align-pointer=type --align-reference=name --delete-empty-lines cuda.h // fmt this +sed -i 's/_v2//g' cuda.h // Remove _v2 stuff from cuda.h +sed -i 's/_v3//g' cuda.h // Remove _v3 stuff from cuda.h sed -i -E 's/^#.+//g' cuda.h sed -i '/^$/N;/^\n$/D' cuda.h ``` + The first line preprocesses all the macros, leaving a singular header file. The second command processes the files in a way that is readable to me (and the generator program). The last two commands replace any v2/v3 that may be found. The copyright notice from nVidia is then reattached. +After that, the file is manually fixed by means of running `go run *.go`. The errors/panics are all related to parsing of C files (e.g. `unnamed fields not allowed`). These are manually fixed one by one thus: + +* `unnamed fields not allowed` - give said fields dummy names + +When the program finds an error, it will leave the file ungenerated, reporting errors instead. This allows for new versions of `cuda.h` to be adapted quickly. + +Here's an example moving from CUDA 9.0 to CUDA 11.0's API (CUDA10 didn't require all these changes so I postponed making them). + +When upgrading to support CUDA11, there were many new constructs that were introduced that needed a manual Go translation. The errors are that the `ctype`s are not known to the translator. The following line will simply output them. + +``` +$ go run *.go 2>&1 >/dev/null | grep ctype > TODO +``` + +Once we have this list, we can then either + +a) manually make corresponding Go data structures. b) generate enums (if they are enums). c) ignore them. + + # Manually fixed after generation # * `MemsetD32` @@ -50,10 +74,10 @@ The first line preprocesses all the macros, leaving a singular header file. The * `PopCurrentCtx` - deleted * `SetCurrentContext` - deleted * `CurrentContext` - deleted -* `CurrentDevice` +* `CurrentDevice` * `CurrentFlags` * `CanAccessPeer` - deleted * `P2PAttribute` - deleted * `MemAllocManaged` -## Ctx related methods - manually written ## \ No newline at end of file +## Ctx related methods - manually written ##
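The next diff adds a TODO checklist of cuGraph driver-API calls still to be bound. As a rough, hypothetical sketch of the cgo binding pattern one of the checked-off items follows (wrapper and type names invented; this is not the package's actual generated code):

```go
package cu

// #include <cuda.h>
import "C"
import "fmt"

// Graph is a hypothetical Go handle around the driver's CUgraph.
type Graph struct{ g C.CUgraph }

// GraphCreate sketches a binding for:
//   CUresult cuGraphCreate ( CUgraph* phGraph, unsigned int flags )
func GraphCreate(flags uint) (Graph, error) {
	var g C.CUgraph
	if res := C.cuGraphCreate(&g, C.uint(flags)); res != C.CUDA_SUCCESS {
		return Graph{}, fmt.Errorf("cuGraphCreate: CUresult %d", int(res))
	}
	return Graph{g: g}, nil
}
```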
diff --git a/cmd/genlib/TODO b/cmd/genlib/TODO new file mode 100644 index 0000000..91bf551 --- /dev/null +++ b/cmd/genlib/TODO @@ -0,0 +1,40 @@ +* Checklist [13/39] + - [X] CUresult cuGraphAddChildGraphNode ( CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, CUgraph childGraph ) + - [X] CUresult cuGraphAddDependencies ( CUgraph hGraph, const CUgraphNode* from, const CUgraphNode* to, size_t numDependencies ) + - [X] CUresult cuGraphAddEmptyNode ( CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies ) + - [X] CUresult cuGraphAddHostNode ( CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS* nodeParams ) + - [X] CUresult cuGraphAddKernelNode ( CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS* nodeParams ) + - [X] CUresult cuGraphAddMemcpyNode ( CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_MEMCPY3D* copyParams, CUcontext ctx ) + - [X] CUresult cuGraphAddMemsetNode ( CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS* memsetParams, CUcontext ctx ) + - [ ] CUresult cuGraphChildGraphNodeGetGraph ( CUgraphNode hNode, CUgraph* phGraph ) + - [X] CUresult cuGraphClone ( CUgraph* phGraphClone, CUgraph originalGraph ) + - [X] CUresult cuGraphCreate ( CUgraph* phGraph, unsigned int flags ) + - [X] CUresult cuGraphDestroy ( CUgraph hGraph ) + - [X] CUresult cuGraphDestroyNode ( CUgraphNode hNode ) + - [X] CUresult cuGraphExecDestroy ( CUgraphExec hGraphExec ) + - [ ] CUresult cuGraphExecHostNodeSetParams ( CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS* nodeParams ) + - [ ] CUresult cuGraphExecKernelNodeSetParams ( CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS* nodeParams ) + - [ ] CUresult cuGraphExecMemcpyNodeSetParams ( CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D* copyParams, CUcontext ctx ) + - [ ] CUresult cuGraphExecMemsetNodeSetParams ( CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS* memsetParams, CUcontext ctx ) + - [ ] CUresult cuGraphExecUpdate ( CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode* hErrorNode_out, CUgraphExecUpdateResult* updateResult_out ) + - [X] CUresult cuGraphGetEdges ( CUgraph hGraph, CUgraphNode* from, CUgraphNode* to, size_t* numEdges ) + - [ ] CUresult cuGraphGetNodes ( CUgraph hGraph, CUgraphNode* nodes, size_t* numNodes ) + - [ ] CUresult cuGraphGetRootNodes ( CUgraph hGraph, CUgraphNode* rootNodes, size_t* numRootNodes ) + - [ ] CUresult cuGraphHostNodeGetParams ( CUgraphNode hNode, CUDA_HOST_NODE_PARAMS* nodeParams ) + - [ ] CUresult cuGraphHostNodeSetParams ( CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS* nodeParams ) + - [ ] CUresult cuGraphInstantiate ( CUgraphExec* phGraphExec, CUgraph hGraph, CUgraphNode* phErrorNode, char* logBuffer, size_t bufferSize ) + - [ ] CUresult cuGraphKernelNodeCopyAttributes ( CUgraphNode dst, CUgraphNode src ) + - [ 
] CUresult cuGraphKernelNodeGetAttribute ( CUgraphNode hNode, CUkernelNodeAttrID attr, CUkernelNodeAttrValue* value_out ) + - [ ] CUresult cuGraphKernelNodeGetParams ( CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS* nodeParams ) + - [ ] CUresult cuGraphKernelNodeSetAttribute ( CUgraphNode hNode, CUkernelNodeAttrID attr, const CUkernelNodeAttrValue* value ) + - [ ] CUresult cuGraphKernelNodeSetParams ( CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS* nodeParams ) + - [ ] CUresult cuGraphLaunch ( CUgraphExec hGraphExec, CUstream hStream ) + - [ ] CUresult cuGraphMemcpyNodeGetParams ( CUgraphNode hNode, CUDA_MEMCPY3D* nodeParams ) + - [ ] CUresult cuGraphMemcpyNodeSetParams ( CUgraphNode hNode, const CUDA_MEMCPY3D* nodeParams ) + - [ ] CUresult cuGraphMemsetNodeGetParams ( CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS* nodeParams ) + - [ ] CUresult cuGraphMemsetNodeSetParams ( CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS* nodeParams ) + - [ ] CUresult cuGraphNodeFindInClone ( CUgraphNode* phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph ) + - [ ] CUresult cuGraphNodeGetDependencies ( CUgraphNode hNode, CUgraphNode* dependencies, size_t* numDependencies ) + - [ ] CUresult cuGraphNodeGetDependentNodes ( CUgraphNode hNode, CUgraphNode* dependentNodes, size_t* numDependentNodes ) + - [ ] CUresult cuGraphNodeGetType ( CUgraphNode hNode, CUgraphNodeType* type ) + - [ ] CUresult cuGraphRemoveDependencies ( CUgraph hGraph, const CUgraphNode* from, const CUgraphNode* to, size_t numDependencies ) diff --git a/cmd/genlib/cuda.h b/cmd/genlib/cuda.h index 4a9463f..5608b5f 100644 --- a/cmd/genlib/cuda.h +++ b/cmd/genlib/cuda.h @@ -1,4 +1,3 @@ - typedef long unsigned int size_t; typedef int wchar_t; @@ -7,98 +6,77 @@ typedef enum { P_PID, P_PGID } idtype_t; - typedef struct { int quot; int rem; } div_t; - typedef struct { long int quot; long int rem; } ldiv_t; - __extension__ typedef struct { long long int quot; long long int rem; } lldiv_t; -extern size_t __ctype_get_mb_cur_max (void) __attribute__ ((__nothrow__ , __leaf__)) ; - +extern size_t __ctype_get_mb_cur_max (void) __attribute__ ((__nothrow__, __leaf__)) ; extern double atof (const char* __nptr) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1))) ; - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1))) ; extern int atoi (const char* __nptr) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1))) ; - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1))) ; extern long int atol (const char* __nptr) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1))) ; - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1))) ; __extension__ extern long long int atoll (const char* __nptr) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1))) ; - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1))) ; extern double strtod (const char* __restrict __nptr, char** __restrict __endptr) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); extern float strtof (const char* __restrict __nptr, - char** __restrict __endptr) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ 
(1))); - + char** __restrict __endptr) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); extern long double strtold (const char* __restrict __nptr, char** __restrict __endptr) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); extern long int strtol (const char* __restrict __nptr, char** __restrict __endptr, int __base) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); extern unsigned long int strtoul (const char* __restrict __nptr, char** __restrict __endptr, int __base) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); __extension__ extern long long int strtoq (const char* __restrict __nptr, char** __restrict __endptr, int __base) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); __extension__ extern unsigned long long int strtouq (const char* __restrict __nptr, char** __restrict __endptr, int __base) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); __extension__ extern long long int strtoll (const char* __restrict __nptr, char** __restrict __endptr, int __base) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); __extension__ extern unsigned long long int strtoull (const char* __restrict __nptr, char** __restrict __endptr, int __base) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); -extern char* l64a (long int __n) __attribute__ ((__nothrow__ , __leaf__)) ; - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); +extern char* l64a (long int __n) __attribute__ ((__nothrow__, __leaf__)) ; extern long int a64l (const char* __s) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1))) ; +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1))) ; typedef unsigned char __u_char; typedef unsigned short int __u_short; typedef unsigned int __u_int; typedef unsigned long int __u_long; - typedef signed char __int8_t; typedef unsigned char __uint8_t; typedef signed short int __int16_t; typedef unsigned short int __uint16_t; typedef signed int __int32_t; typedef unsigned int __uint32_t; - typedef signed long int __int64_t; typedef unsigned long int __uint64_t; - typedef long int __quad_t; typedef unsigned long int __u_quad_t; - typedef long int __intmax_t; typedef unsigned long int __uintmax_t; - typedef unsigned long int __dev_t; typedef unsigned int __uid_t; typedef unsigned int __gid_t; @@ -119,43 +97,26 @@ typedef unsigned int __id_t; typedef long int __time_t; typedef unsigned int __useconds_t; typedef long int __suseconds_t; - typedef int __daddr_t; typedef int __key_t; - typedef int __clockid_t; - typedef void* __timer_t; - typedef long int __blksize_t; - typedef long int __blkcnt_t; typedef long int __blkcnt64_t; - typedef unsigned long int __fsblkcnt_t; typedef unsigned long int __fsblkcnt64_t; - typedef unsigned long int __fsfilcnt_t; typedef unsigned long int __fsfilcnt64_t; - typedef long int __fsword_t; - 
typedef long int __ssize_t; - typedef long int __syscall_slong_t; - typedef unsigned long int __syscall_ulong_t; - typedef __off64_t __loff_t; -typedef __quad_t* __qaddr_t; typedef char* __caddr_t; - typedef long int __intptr_t; - typedef unsigned int __socklen_t; - typedef int __sig_atomic_t; - typedef __u_char u_char; typedef __u_short u_short; typedef __u_int u_int; @@ -163,44 +124,27 @@ typedef __u_long u_long; typedef __quad_t quad_t; typedef __u_quad_t u_quad_t; typedef __fsid_t fsid_t; - typedef __loff_t loff_t; - typedef __ino_t ino_t; typedef __dev_t dev_t; - typedef __gid_t gid_t; - typedef __mode_t mode_t; - typedef __nlink_t nlink_t; - typedef __uid_t uid_t; - typedef __off_t off_t; typedef __pid_t pid_t; - typedef __id_t id_t; - typedef __ssize_t ssize_t; - typedef __daddr_t daddr_t; typedef __caddr_t caddr_t; - typedef __key_t key_t; - typedef __clock_t clock_t; - typedef __clockid_t clockid_t; - typedef __time_t time_t; - typedef __timer_t timer_t; - typedef unsigned long int ulong; typedef unsigned short int ushort; typedef unsigned int uint; - typedef __int8_t int8_t; typedef __int16_t int16_t; typedef __int32_t int32_t; @@ -209,9 +153,7 @@ typedef unsigned int u_int8_t __attribute__ ((__mode__ (__QI__))); typedef unsigned int u_int16_t __attribute__ ((__mode__ (__HI__))); typedef unsigned int u_int32_t __attribute__ ((__mode__ (__SI__))); typedef unsigned int u_int64_t __attribute__ ((__mode__ (__DI__))); - typedef int register_t __attribute__ ((__mode__ (__word__))); - static __inline unsigned int __bswap_32 (unsigned int __bsx) { return __builtin_bswap32 (__bsx); @@ -224,42 +166,31 @@ static __inline __uint16_t __uint16_identity (__uint16_t __x) { return __x; } - static __inline __uint32_t __uint32_identity (__uint32_t __x) { return __x; } - static __inline __uint64_t __uint64_identity (__uint64_t __x) { return __x; } - typedef struct { unsigned long int __val[(1024 / (8 * sizeof (unsigned long int)))]; } __sigset_t; - typedef __sigset_t sigset_t; - struct timeval { __time_t tv_sec; __suseconds_t tv_usec; }; - struct timespec { __time_t tv_sec; __syscall_slong_t tv_nsec; }; - typedef __suseconds_t suseconds_t; - typedef long int __fd_mask; typedef struct { - __fd_mask __fds_bits[1024 / (8 * (int) sizeof (__fd_mask))]; - } fd_set; - typedef __fd_mask fd_mask; extern int select (int __nfds, fd_set* __restrict __readfds, @@ -272,16 +203,13 @@ extern int pselect (int __nfds, fd_set* __restrict __readfds, const struct timespec* __restrict __timeout, const __sigset_t* __restrict __sigmask); -extern unsigned int gnu_dev_major (__dev_t __dev) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__const__)); -extern unsigned int gnu_dev_minor (__dev_t __dev) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__const__)); -extern __dev_t gnu_dev_makedev (unsigned int __major, unsigned int __minor) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__const__)); +extern unsigned int gnu_dev_major (__dev_t __dev) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__const__)); +extern unsigned int gnu_dev_minor (__dev_t __dev) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__const__)); +extern __dev_t gnu_dev_makedev (unsigned int __major, unsigned int __minor) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__const__)); typedef __blksize_t blksize_t; - typedef __blkcnt_t blkcnt_t; - typedef __fsblkcnt_t fsblkcnt_t; - typedef __fsfilcnt_t fsfilcnt_t; struct __pthread_rwlock_arch_t { unsigned int __readers; @@ -290,18 +218,13 @@ struct 
__pthread_rwlock_arch_t { unsigned int __writers_futex; unsigned int __pad3; unsigned int __pad4; - int __cur_writer; int __shared; signed char __rwelision; - unsigned char __pad1[7]; - unsigned long int __pad2; - unsigned int __flags; }; - typedef struct __pthread_internal_list { struct __pthread_internal_list* __prev; struct __pthread_internal_list* __next; @@ -310,9 +233,7 @@ struct __pthread_mutex_s { int __lock ; unsigned int __count; int __owner; - unsigned int __nusers; - int __kind; short __spins; @@ -320,7 +241,6 @@ struct __pthread_mutex_s { __pthread_list_t __list; }; - struct __pthread_cond_s { __extension__ union { __extension__ unsigned long long int __wseq; @@ -328,88 +248,70 @@ struct __pthread_cond_s { unsigned int __low; unsigned int __high; } __wseq32; - } XXX; + } foo; __extension__ union { __extension__ unsigned long long int __g1_start; struct { unsigned int __low; unsigned int __high; } __g1_start32; - } YYY; + } bar ; unsigned int __g_refs[2] ; unsigned int __g_size[2]; unsigned int __g1_orig_size; unsigned int __wrefs; unsigned int __g_signals[2]; }; - typedef unsigned long int pthread_t; - typedef union { char __size[4]; int __align; } pthread_mutexattr_t; - typedef union { char __size[4]; int __align; } pthread_condattr_t; - typedef unsigned int pthread_key_t; - typedef int pthread_once_t; - union pthread_attr_t { char __size[56]; long int __align; }; - typedef union pthread_attr_t pthread_attr_t; - typedef union { struct __pthread_mutex_s __data; char __size[40]; long int __align; } pthread_mutex_t; - typedef union { struct __pthread_cond_s __data; char __size[48]; __extension__ long long int __align; } pthread_cond_t; - typedef union { struct __pthread_rwlock_arch_t __data; char __size[56]; long int __align; } pthread_rwlock_t; - typedef union { char __size[8]; long int __align; } pthread_rwlockattr_t; - typedef volatile int pthread_spinlock_t; - typedef union { char __size[32]; long int __align; } pthread_barrier_t; - typedef union { char __size[4]; int __align; } pthread_barrierattr_t; -extern long int random (void) __attribute__ ((__nothrow__ , __leaf__)); - -extern void srandom (unsigned int __seed) __attribute__ ((__nothrow__ , __leaf__)); - +extern long int random (void) __attribute__ ((__nothrow__, __leaf__)); +extern void srandom (unsigned int __seed) __attribute__ ((__nothrow__, __leaf__)); extern char* initstate (unsigned int __seed, char* __statebuf, - size_t __statelen) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (2))); - -extern char* setstate (char* __statebuf) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); - + size_t __statelen) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (2))); +extern char* setstate (char* __statebuf) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); struct random_data { int32_t* fptr; int32_t* rptr; @@ -419,249 +321,189 @@ struct random_data { int rand_sep; int32_t* end_ptr; }; - extern int random_r (struct random_data* __restrict __buf, - int32_t* __restrict __result) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1, 2))); - + int32_t* __restrict __result) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1, 2))); extern int srandom_r (unsigned int __seed, struct random_data* __buf) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (2))); - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (2))); extern int initstate_r (unsigned int __seed, 
char* __restrict __statebuf, size_t __statelen, struct random_data* __restrict __buf) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (2, 4))); - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (2, 4))); extern int setstate_r (char* __restrict __statebuf, struct random_data* __restrict __buf) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1, 2))); - -extern int rand (void) __attribute__ ((__nothrow__ , __leaf__)); - -extern void srand (unsigned int __seed) __attribute__ ((__nothrow__ , __leaf__)); - -extern int rand_r (unsigned int* __seed) __attribute__ ((__nothrow__ , __leaf__)); - -extern double drand48 (void) __attribute__ ((__nothrow__ , __leaf__)); -extern double erand48 (unsigned short int __xsubi[3]) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); - -extern long int lrand48 (void) __attribute__ ((__nothrow__ , __leaf__)); +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1, 2))); +extern int rand (void) __attribute__ ((__nothrow__, __leaf__)); +extern void srand (unsigned int __seed) __attribute__ ((__nothrow__, __leaf__)); +extern int rand_r (unsigned int* __seed) __attribute__ ((__nothrow__, __leaf__)); +extern double drand48 (void) __attribute__ ((__nothrow__, __leaf__)); +extern double erand48 (unsigned short int __xsubi[3]) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); +extern long int lrand48 (void) __attribute__ ((__nothrow__, __leaf__)); extern long int nrand48 (unsigned short int __xsubi[3]) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); - -extern long int mrand48 (void) __attribute__ ((__nothrow__ , __leaf__)); +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); +extern long int mrand48 (void) __attribute__ ((__nothrow__, __leaf__)); extern long int jrand48 (unsigned short int __xsubi[3]) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); - -extern void srand48 (long int __seedval) __attribute__ ((__nothrow__ , __leaf__)); +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); +extern void srand48 (long int __seedval) __attribute__ ((__nothrow__, __leaf__)); extern unsigned short int* seed48 (unsigned short int __seed16v[3]) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); -extern void lcong48 (unsigned short int __param[7]) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); +extern void lcong48 (unsigned short int __param[7]) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); struct drand48_data { unsigned short int __x[3]; unsigned short int __old_x[3]; unsigned short int __c; unsigned short int __init; __extension__ unsigned long long int __a; - }; - extern int drand48_r (struct drand48_data* __restrict __buffer, - double* __restrict __result) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1, 2))); + double* __restrict __result) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1, 2))); extern int erand48_r (unsigned short int __xsubi[3], struct drand48_data* __restrict __buffer, - double* __restrict __result) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1, 2))); - + double* __restrict __result) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1, 2))); extern int lrand48_r (struct 
drand48_data* __restrict __buffer, long int* __restrict __result) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1, 2))); +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1, 2))); extern int nrand48_r (unsigned short int __xsubi[3], struct drand48_data* __restrict __buffer, long int* __restrict __result) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1, 2))); - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1, 2))); extern int mrand48_r (struct drand48_data* __restrict __buffer, long int* __restrict __result) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1, 2))); +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1, 2))); extern int jrand48_r (unsigned short int __xsubi[3], struct drand48_data* __restrict __buffer, long int* __restrict __result) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1, 2))); - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1, 2))); extern int srand48_r (long int __seedval, struct drand48_data* __buffer) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (2))); - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (2))); extern int seed48_r (unsigned short int __seed16v[3], - struct drand48_data* __buffer) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1, 2))); - + struct drand48_data* __buffer) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1, 2))); extern int lcong48_r (unsigned short int __param[7], struct drand48_data* __buffer) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1, 2))); - -extern void* malloc (size_t __size) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__malloc__)) ; - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1, 2))); +extern void* malloc (size_t __size) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__malloc__)) ; extern void* calloc (size_t __nmemb, size_t __size) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__malloc__)) ; - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__malloc__)) ; extern void* realloc (void* __ptr, size_t __size) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__warn_unused_result__)); -extern void free (void* __ptr) __attribute__ ((__nothrow__ , __leaf__)); - -extern void* alloca (size_t __size) __attribute__ ((__nothrow__ , __leaf__)); +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__warn_unused_result__)); +extern void free (void* __ptr) __attribute__ ((__nothrow__, __leaf__)); -extern void* valloc (size_t __size) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__malloc__)) ; +extern void* alloca (size_t __size) __attribute__ ((__nothrow__, __leaf__)); +extern void* valloc (size_t __size) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__malloc__)) ; extern int posix_memalign (void** __memptr, size_t __alignment, size_t __size) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))) ; - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))) ; extern void* aligned_alloc (size_t __alignment, size_t __size) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__malloc__)) __attribute__ ((__alloc_size__ (2))) ; - -extern void abort (void) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__noreturn__)); - -extern int atexit (void (*__func) (void)) __attribute__ ((__nothrow__ , 
__leaf__)) __attribute__ ((__nonnull__ (1))); - -extern int at_quick_exit (void (*__func) (void)) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__malloc__)) __attribute__ ((__alloc_size__ (2))) ; +extern void abort (void) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__noreturn__)); +extern int atexit (void (*__func) (void)) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); +extern int at_quick_exit (void (*__func) (void)) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); extern int on_exit (void (*__func) (int __status, void* __arg), void* __arg) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); - -extern void exit (int __status) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__noreturn__)); - -extern void quick_exit (int __status) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__noreturn__)); - -extern void _Exit (int __status) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__noreturn__)); - -extern char* getenv (const char* __name) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))) ; -extern int putenv (char* __string) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); +extern void exit (int __status) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__noreturn__)); +extern void quick_exit (int __status) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__noreturn__)); +extern void _Exit (int __status) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__noreturn__)); +extern char* getenv (const char* __name) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))) ; +extern int putenv (char* __string) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); extern int setenv (const char* __name, const char* __value, int __replace) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (2))); - -extern int unsetenv (const char* __name) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); - -extern int clearenv (void) __attribute__ ((__nothrow__ , __leaf__)); -extern char* mktemp (char* __template) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (2))); +extern int unsetenv (const char* __name) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); +extern int clearenv (void) __attribute__ ((__nothrow__, __leaf__)); +extern char* mktemp (char* __template) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); extern int mkstemp (char* __template) __attribute__ ((__nonnull__ (1))) ; extern int mkstemps (char* __template, int __suffixlen) __attribute__ ((__nonnull__ (1))) ; -extern char* mkdtemp (char* __template) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))) ; +extern char* mkdtemp (char* __template) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))) ; extern int system (const char* __command) ; extern char* realpath (const char* __restrict __name, - char* __restrict __resolved) __attribute__ ((__nothrow__ , __leaf__)) ; - + char* __restrict __resolved) __attribute__ ((__nothrow__, __leaf__)) ; typedef int (*__compar_fn_t) (const void*, const void*); extern void* bsearch (const void* 
__key, const void* __base, size_t __nmemb, size_t __size, __compar_fn_t __compar) __attribute__ ((__nonnull__ (1, 2, 5))) ; - extern void qsort (void* __base, size_t __nmemb, size_t __size, __compar_fn_t __compar) __attribute__ ((__nonnull__ (1, 4))); -extern int abs (int __x) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__const__)) ; -extern long int labs (long int __x) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__const__)) ; - +extern int abs (int __x) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__const__)) ; +extern long int labs (long int __x) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__const__)) ; __extension__ extern long long int llabs (long long int __x) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__const__)) ; - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__const__)) ; extern div_t div (int __numer, int __denom) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__const__)) ; +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__const__)) ; extern ldiv_t ldiv (long int __numer, long int __denom) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__const__)) ; - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__const__)) ; __extension__ extern lldiv_t lldiv (long long int __numer, long long int __denom) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__const__)) ; +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__const__)) ; extern char* ecvt (double __value, int __ndigit, int* __restrict __decpt, - int* __restrict __sign) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (3, 4))) ; - + int* __restrict __sign) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (3, 4))) ; extern char* fcvt (double __value, int __ndigit, int* __restrict __decpt, - int* __restrict __sign) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (3, 4))) ; - + int* __restrict __sign) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (3, 4))) ; extern char* gcvt (double __value, int __ndigit, char* __buf) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (3))) ; - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (3))) ; extern char* qecvt (long double __value, int __ndigit, int* __restrict __decpt, int* __restrict __sign) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (3, 4))) ; +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (3, 4))) ; extern char* qfcvt (long double __value, int __ndigit, int* __restrict __decpt, int* __restrict __sign) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (3, 4))) ; +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (3, 4))) ; extern char* qgcvt (long double __value, int __ndigit, char* __buf) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (3))) ; - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (3))) ; extern int ecvt_r (double __value, int __ndigit, int* __restrict __decpt, int* __restrict __sign, char* __restrict __buf, - size_t __len) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (3, 4, 5))); + size_t __len) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (3, 4, 5))); extern int fcvt_r (double __value, int __ndigit, int* __restrict __decpt, int* __restrict __sign, char* __restrict __buf, - size_t __len) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ 
(3, 4, 5))); - + size_t __len) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (3, 4, 5))); extern int qecvt_r (long double __value, int __ndigit, int* __restrict __decpt, int* __restrict __sign, char* __restrict __buf, size_t __len) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (3, 4, 5))); +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (3, 4, 5))); extern int qfcvt_r (long double __value, int __ndigit, int* __restrict __decpt, int* __restrict __sign, char* __restrict __buf, size_t __len) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (3, 4, 5))); - -extern int mblen (const char* __s, size_t __n) __attribute__ ((__nothrow__ , __leaf__)); - +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (3, 4, 5))); +extern int mblen (const char* __s, size_t __n) __attribute__ ((__nothrow__, __leaf__)); extern int mbtowc (wchar_t* __restrict __pwc, - const char* __restrict __s, size_t __n) __attribute__ ((__nothrow__ , __leaf__)); - -extern int wctomb (char* __s, wchar_t __wchar) __attribute__ ((__nothrow__ , __leaf__)); - + const char* __restrict __s, size_t __n) __attribute__ ((__nothrow__, __leaf__)); +extern int wctomb (char* __s, wchar_t __wchar) __attribute__ ((__nothrow__, __leaf__)); extern size_t mbstowcs (wchar_t* __restrict __pwcs, - const char* __restrict __s, size_t __n) __attribute__ ((__nothrow__ , __leaf__)); - + const char* __restrict __s, size_t __n) __attribute__ ((__nothrow__, __leaf__)); extern size_t wcstombs (char* __restrict __s, const wchar_t* __restrict __pwcs, size_t __n) -__attribute__ ((__nothrow__ , __leaf__)); - -extern int rpmatch (const char* __response) __attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))) ; +__attribute__ ((__nothrow__, __leaf__)); +extern int rpmatch (const char* __response) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))) ; extern int getsubopt (char** __restrict __optionp, char* const* __restrict __tokens, char** __restrict __valuep) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1, 2, 3))) ; +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1, 2, 3))) ; extern int getloadavg (double __loadavg[], int __nelem) -__attribute__ ((__nothrow__ , __leaf__)) __attribute__ ((__nonnull__ (1))); +__attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__nonnull__ (1))); typedef __uint8_t uint8_t; typedef __uint16_t uint16_t; typedef __uint32_t uint32_t; typedef __uint64_t uint64_t; - typedef signed char int_least8_t; typedef short int int_least16_t; typedef int int_least32_t; - typedef long int int_least64_t; - typedef unsigned char uint_least8_t; typedef unsigned short int uint_least16_t; typedef unsigned int uint_least32_t; - typedef unsigned long int uint_least64_t; typedef signed char int_fast8_t; - typedef long int int_fast16_t; typedef long int int_fast32_t; typedef long int int_fast64_t; typedef unsigned char uint_fast8_t; - typedef unsigned long int uint_fast16_t; typedef unsigned long int uint_fast32_t; typedef unsigned long int uint_fast64_t; typedef long int intptr_t; - typedef unsigned long int uintptr_t; typedef __intmax_t intmax_t; typedef __uintmax_t uintmax_t; - typedef uint32_t cuuint32_t; typedef uint64_t cuuint64_t; typedef unsigned long long CUdeviceptr; - typedef int CUdevice; typedef struct CUctx_st* CUcontext; typedef struct CUmod_st* CUmodule; @@ -675,41 +517,39 @@ typedef struct CUstream_st* CUstream; typedef struct CUgraphicsResource_st* 
CUgraphicsResource; typedef unsigned long long CUtexObject; typedef unsigned long long CUsurfObject; - +typedef struct CUextMemory_st* CUexternalMemory; +typedef struct CUextSemaphore_st* CUexternalSemaphore; +typedef struct CUgraph_st* CUgraph; +typedef struct CUgraphNode_st* CUgraphNode; +typedef struct CUgraphExec_st* CUgraphExec; typedef struct CUuuid_st { char bytes[16]; } CUuuid; typedef struct CUipcEventHandle_st { char reserved[64]; } CUipcEventHandle; - typedef struct CUipcMemHandle_st { char reserved[64]; } CUipcMemHandle; - typedef enum CUipcMem_flags_enum { CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 } CUipcMem_flags; - typedef enum CUmemAttach_flags_enum { CU_MEM_ATTACH_GLOBAL = 0x1, CU_MEM_ATTACH_HOST = 0x2, CU_MEM_ATTACH_SINGLE = 0x4 } CUmemAttach_flags; - typedef enum CUctx_flags_enum { CU_CTX_SCHED_AUTO = 0x00, CU_CTX_SCHED_SPIN = 0x01, CU_CTX_SCHED_YIELD = 0x02, CU_CTX_SCHED_BLOCKING_SYNC = 0x04, CU_CTX_BLOCKING_SYNC = 0x04, - CU_CTX_SCHED_MASK = 0x07, CU_CTX_MAP_HOST = 0x08, CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, CU_CTX_FLAGS_MASK = 0x1f } CUctx_flags; - typedef enum CUstream_flags_enum { CU_STREAM_DEFAULT = 0x0, CU_STREAM_NON_BLOCKING = 0x1 @@ -720,29 +560,24 @@ typedef enum CUevent_flags_enum { CU_EVENT_DISABLE_TIMING = 0x2, CU_EVENT_INTERPROCESS = 0x4 } CUevent_flags; - typedef enum CUstreamWaitValue_flags_enum { CU_STREAM_WAIT_VALUE_GEQ = 0x0, - CU_STREAM_WAIT_VALUE_EQ = 0x1, CU_STREAM_WAIT_VALUE_AND = 0x2, + CU_STREAM_WAIT_VALUE_NOR = 0x3, CU_STREAM_WAIT_VALUE_FLUSH = 1<<30 - } CUstreamWaitValue_flags; - typedef enum CUstreamWriteValue_flags_enum { CU_STREAM_WRITE_VALUE_DEFAULT = 0x0, CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1 - } CUstreamWriteValue_flags; - typedef enum CUstreamBatchMemOpType_enum { CU_STREAM_MEM_OP_WAIT_VALUE_32 = 1, CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2, + CU_STREAM_MEM_OP_WAIT_VALUE_64 = 4, + CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5, CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3 - } CUstreamBatchMemOpType; - typedef union CUstreamBatchMemOpParams_union { CUstreamBatchMemOpType operation; struct CUstreamMemOpWaitValueParams_st { @@ -750,8 +585,8 @@ typedef union CUstreamBatchMemOpParams_union { CUdeviceptr address; union { cuuint32_t value; - cuuint64_t pad; - } XXX; + cuuint64_t value64; + } foo; unsigned int flags; CUdeviceptr alias; } waitValue; @@ -760,8 +595,8 @@ typedef union CUstreamBatchMemOpParams_union { CUdeviceptr address; union { cuuint32_t value; - cuuint64_t pad; - } YYY; + cuuint64_t value64; + } bar; unsigned int flags; CUdeviceptr alias; } writeValue; @@ -771,12 +606,10 @@ typedef union CUstreamBatchMemOpParams_union { } flushRemoteWrites; cuuint64_t pad[6]; } CUstreamBatchMemOpParams; - typedef enum CUoccupancy_flags_enum { CU_OCCUPANCY_DEFAULT = 0x0, CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1 } CUoccupancy_flags; - typedef enum CUarray_format_enum { CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, @@ -787,19 +620,16 @@ typedef enum CUarray_format_enum { CU_AD_FORMAT_HALF = 0x10, CU_AD_FORMAT_FLOAT = 0x20 } CUarray_format; - typedef enum CUaddress_mode_enum { CU_TR_ADDRESS_MODE_WRAP = 0, CU_TR_ADDRESS_MODE_CLAMP = 1, CU_TR_ADDRESS_MODE_MIRROR = 2, CU_TR_ADDRESS_MODE_BORDER = 3 } CUaddress_mode; - typedef enum CUfilter_mode_enum { CU_TR_FILTER_MODE_POINT = 0, CU_TR_FILTER_MODE_LINEAR = 1 } CUfilter_mode; - typedef enum CUdevice_attribute_enum { CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, @@ -897,9 +727,28 @@ typedef enum CUdevice_attribute_enum { 
CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, + CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, + CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, + CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, + CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, + CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, + CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, + CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102, + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, + CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, + CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, + CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, + CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109, + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, + CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, CU_DEVICE_ATTRIBUTE_MAX } CUdevice_attribute; - typedef struct CUdevprop_st { int maxThreadsPerBlock; int maxThreadsDim[3]; @@ -912,7 +761,6 @@ typedef struct CUdevprop_st { int clockRate; int textureAlign; } CUdevprop; - typedef enum CUpointer_attribute_enum { CU_POINTER_ATTRIBUTE_CONTEXT = 1, CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, @@ -921,54 +769,55 @@ typedef enum CUpointer_attribute_enum { CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6, CU_POINTER_ATTRIBUTE_BUFFER_ID = 7, - CU_POINTER_ATTRIBUTE_IS_MANAGED = 8 + CU_POINTER_ATTRIBUTE_IS_MANAGED = 8, + CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9, + CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10, + CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 11, + CU_POINTER_ATTRIBUTE_RANGE_SIZE = 12, + CU_POINTER_ATTRIBUTE_MAPPED = 13, + CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14, + CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15 } CUpointer_attribute; - typedef enum CUfunction_attribute_enum { - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, - CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, - CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, - CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, - CU_FUNC_ATTRIBUTE_NUM_REGS = 4, CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, - CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, - + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, + CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, CU_FUNC_ATTRIBUTE_MAX } CUfunction_attribute; - typedef enum CUfunc_cache_enum { CU_FUNC_CACHE_PREFER_NONE = 0x00, CU_FUNC_CACHE_PREFER_SHARED = 0x01, CU_FUNC_CACHE_PREFER_L1 = 0x02, CU_FUNC_CACHE_PREFER_EQUAL = 0x03 } CUfunc_cache; - typedef enum CUsharedconfig_enum { CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 } CUsharedconfig; - +typedef enum CUshared_carveout_enum { + CU_SHAREDMEM_CARVEOUT_DEFAULT = -1, + CU_SHAREDMEM_CARVEOUT_MAX_SHARED = 100, + CU_SHAREDMEM_CARVEOUT_MAX_L1 = 0 +} CUshared_carveout; typedef enum CUmemorytype_enum { CU_MEMORYTYPE_HOST = 0x01, CU_MEMORYTYPE_DEVICE = 0x02, CU_MEMORYTYPE_ARRAY = 0x03, CU_MEMORYTYPE_UNIFIED = 
0x04 } CUmemorytype; - typedef enum CUcomputemode_enum { CU_COMPUTEMODE_DEFAULT = 0, CU_COMPUTEMODE_PROHIBITED = 2, CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 } CUcomputemode; - typedef enum CUmem_advise_enum { CU_MEM_ADVISE_SET_READ_MOSTLY = 1, CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2, @@ -977,52 +826,36 @@ typedef enum CUmem_advise_enum { CU_MEM_ADVISE_SET_ACCESSED_BY = 5, CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6 } CUmem_advise; - typedef enum CUmem_range_attribute_enum { CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1, CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2, CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3, CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4 } CUmem_range_attribute; - typedef enum CUjit_option_enum { - CU_JIT_MAX_REGISTERS = 0, CU_JIT_THREADS_PER_BLOCK, - CU_JIT_WALL_TIME, CU_JIT_INFO_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, - CU_JIT_OPTIMIZATION_LEVEL, - CU_JIT_TARGET_FROM_CUCONTEXT, - CU_JIT_TARGET, CU_JIT_FALLBACK_STRATEGY, - CU_JIT_GENERATE_DEBUG_INFO, - CU_JIT_LOG_VERBOSE, - CU_JIT_GENERATE_LINE_INFO, - CU_JIT_CACHE_MODE, - CU_JIT_NEW_SM3X_OPT, CU_JIT_FAST_COMPILE, - + CU_JIT_GLOBAL_SYMBOL_NAMES, + CU_JIT_GLOBAL_SYMBOL_ADDRESSES, + CU_JIT_GLOBAL_SYMBOL_COUNT, CU_JIT_NUM_OPTIONS - } CUjit_option; - typedef enum CUjit_target_enum { - CU_TARGET_COMPUTE_10 = 10, - CU_TARGET_COMPUTE_11 = 11, - CU_TARGET_COMPUTE_12 = 12, - CU_TARGET_COMPUTE_13 = 13, CU_TARGET_COMPUTE_20 = 20, CU_TARGET_COMPUTE_21 = 21, CU_TARGET_COMPUTE_30 = 30, @@ -1034,39 +867,30 @@ typedef enum CUjit_target_enum { CU_TARGET_COMPUTE_53 = 53, CU_TARGET_COMPUTE_60 = 60, CU_TARGET_COMPUTE_61 = 61, - CU_TARGET_COMPUTE_62 = 62 + CU_TARGET_COMPUTE_62 = 62, + CU_TARGET_COMPUTE_70 = 70, + CU_TARGET_COMPUTE_72 = 72, + CU_TARGET_COMPUTE_75 = 75, + CU_TARGET_COMPUTE_80 = 80 } CUjit_target; - typedef enum CUjit_fallback_enum { CU_PREFER_PTX = 0, - CU_PREFER_BINARY - } CUjit_fallback; - typedef enum CUjit_cacheMode_enum { CU_JIT_CACHE_OPTION_NONE = 0, CU_JIT_CACHE_OPTION_CG, CU_JIT_CACHE_OPTION_CA } CUjit_cacheMode; - typedef enum CUjitInputType_enum { - CU_JIT_INPUT_CUBIN = 0, - CU_JIT_INPUT_PTX, - CU_JIT_INPUT_FATBINARY, - CU_JIT_INPUT_OBJECT, - CU_JIT_INPUT_LIBRARY, - CU_JIT_NUM_INPUT_TYPES } CUjitInputType; - typedef struct CUlinkState_st* CUlinkState; - typedef enum CUgraphicsRegisterFlags_enum { CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01, @@ -1074,13 +898,11 @@ typedef enum CUgraphicsRegisterFlags_enum { CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04, CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08 } CUgraphicsRegisterFlags; - typedef enum CUgraphicsMapResourceFlags_enum { CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00, CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02 } CUgraphicsMapResourceFlags; - typedef enum CUarray_cubemap_face_enum { CU_CUBEMAP_FACE_POSITIVE_X = 0x00, CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, @@ -1089,166 +911,204 @@ typedef enum CUarray_cubemap_face_enum { CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 } CUarray_cubemap_face; - typedef enum CUlimit_enum { CU_LIMIT_STACK_SIZE = 0x00, CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, CU_LIMIT_MALLOC_HEAP_SIZE = 0x02, CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x03, CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, + CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x05, + CU_LIMIT_PERSISTING_L2_CACHE_SIZE = 0x06, CU_LIMIT_MAX } CUlimit; - typedef enum CUresourcetype_enum { CU_RESOURCE_TYPE_ARRAY = 0x00, CU_RESOURCE_TYPE_MIPMAPPED_ARRAY 
= 0x01, CU_RESOURCE_TYPE_LINEAR = 0x02, CU_RESOURCE_TYPE_PITCH2D = 0x03 } CUresourcetype; - +typedef void ( *CUhostFn)(void* userData); +typedef enum CUaccessProperty_enum { + CU_ACCESS_PROPERTY_NORMAL = 0, + CU_ACCESS_PROPERTY_STREAMING = 1, + CU_ACCESS_PROPERTY_PERSISTING = 2 +} CUaccessProperty; +typedef struct CUaccessPolicyWindow_st { + void* base_ptr; + size_t num_bytes; + float hitRatio; + CUaccessProperty hitProp; + CUaccessProperty missProp; +} CUaccessPolicyWindow; +typedef struct CUDA_KERNEL_NODE_PARAMS_st { + CUfunction func; + unsigned int gridDimX; + unsigned int gridDimY; + unsigned int gridDimZ; + unsigned int blockDimX; + unsigned int blockDimY; + unsigned int blockDimZ; + unsigned int sharedMemBytes; + void** kernelParams; + void** extra; +} CUDA_KERNEL_NODE_PARAMS; +typedef struct CUDA_MEMSET_NODE_PARAMS_st { + CUdeviceptr dst; + size_t pitch; + unsigned int value; + unsigned int elementSize; + size_t width; + size_t height; +} CUDA_MEMSET_NODE_PARAMS; +typedef struct CUDA_HOST_NODE_PARAMS_st { + CUhostFn fn; + void* userData; +} CUDA_HOST_NODE_PARAMS; +typedef enum CUgraphNodeType_enum { + CU_GRAPH_NODE_TYPE_KERNEL = 0, + CU_GRAPH_NODE_TYPE_MEMCPY = 1, + CU_GRAPH_NODE_TYPE_MEMSET = 2, + CU_GRAPH_NODE_TYPE_HOST = 3, + CU_GRAPH_NODE_TYPE_GRAPH = 4, + CU_GRAPH_NODE_TYPE_EMPTY = 5 +} CUgraphNodeType; +typedef enum CUsynchronizationPolicy_enum { + CU_SYNC_POLICY_AUTO = 1, + CU_SYNC_POLICY_SPIN = 2, + CU_SYNC_POLICY_YIELD = 3, + CU_SYNC_POLICY_BLOCKING_SYNC = 4 +} CUsynchronizationPolicy; +typedef enum CUkernelNodeAttrID_enum { + CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1, + CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE = 2 +} CUkernelNodeAttrID; +typedef union CUkernelNodeAttrValue_union { + CUaccessPolicyWindow accessPolicyWindow; + int cooperative; +} CUkernelNodeAttrValue; +typedef enum CUstreamCaptureStatus_enum { + CU_STREAM_CAPTURE_STATUS_NONE = 0, + CU_STREAM_CAPTURE_STATUS_ACTIVE = 1, + CU_STREAM_CAPTURE_STATUS_INVALIDATED = 2 +} CUstreamCaptureStatus; +typedef enum CUstreamCaptureMode_enum { + CU_STREAM_CAPTURE_MODE_GLOBAL = 0, + CU_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1, + CU_STREAM_CAPTURE_MODE_RELAXED = 2 +} CUstreamCaptureMode; +typedef enum CUstreamAttrID_enum { + CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1, + CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3 +} CUstreamAttrID; +typedef union CUstreamAttrValue_union { + CUaccessPolicyWindow accessPolicyWindow; + CUsynchronizationPolicy syncPolicy; +} CUstreamAttrValue; typedef enum cudaError_enum { - CUDA_SUCCESS = 0, - CUDA_ERROR_INVALID_VALUE = 1, - CUDA_ERROR_OUT_OF_MEMORY = 2, - CUDA_ERROR_NOT_INITIALIZED = 3, - CUDA_ERROR_DEINITIALIZED = 4, - CUDA_ERROR_PROFILER_DISABLED = 5, - CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6, - CUDA_ERROR_PROFILER_ALREADY_STARTED = 7, - CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, - CUDA_ERROR_NO_DEVICE = 100, - CUDA_ERROR_INVALID_DEVICE = 101, - CUDA_ERROR_INVALID_IMAGE = 200, CUDA_ERROR_INVALID_CONTEXT = 201, CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, - CUDA_ERROR_MAP_FAILED = 205, - CUDA_ERROR_UNMAP_FAILED = 206, - CUDA_ERROR_ARRAY_IS_MAPPED = 207, - CUDA_ERROR_ALREADY_MAPPED = 208, - CUDA_ERROR_NO_BINARY_FOR_GPU = 209, - CUDA_ERROR_ALREADY_ACQUIRED = 210, - CUDA_ERROR_NOT_MAPPED = 211, - CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, - CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, - CUDA_ERROR_ECC_UNCORRECTABLE = 214, - CUDA_ERROR_UNSUPPORTED_LIMIT = 215, - CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, - CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217, - CUDA_ERROR_INVALID_PTX = 218, - 
CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219, - CUDA_ERROR_NVLINK_UNCORRECTABLE = 220, - + CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221, CUDA_ERROR_INVALID_SOURCE = 300, - CUDA_ERROR_FILE_NOT_FOUND = 301, - CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, - CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, - CUDA_ERROR_OPERATING_SYSTEM = 304, - CUDA_ERROR_INVALID_HANDLE = 400, - + CUDA_ERROR_ILLEGAL_STATE = 401, CUDA_ERROR_NOT_FOUND = 500, - CUDA_ERROR_NOT_READY = 600, CUDA_ERROR_ILLEGAL_ADDRESS = 700, CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, CUDA_ERROR_LAUNCH_TIMEOUT = 702, - CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, - CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, - CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, - CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, - CUDA_ERROR_CONTEXT_IS_DESTROYED = 709, - CUDA_ERROR_ASSERT = 710, - CUDA_ERROR_TOO_MANY_PEERS = 711, - CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, - CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, CUDA_ERROR_HARDWARE_STACK_ERROR = 714, - CUDA_ERROR_ILLEGAL_INSTRUCTION = 715, CUDA_ERROR_MISALIGNED_ADDRESS = 716, CUDA_ERROR_INVALID_ADDRESS_SPACE = 717, - CUDA_ERROR_INVALID_PC = 718, CUDA_ERROR_LAUNCH_FAILED = 719, - + CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720, CUDA_ERROR_NOT_PERMITTED = 800, - CUDA_ERROR_NOT_SUPPORTED = 801, - + CUDA_ERROR_SYSTEM_NOT_READY = 802, + CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803, + CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804, + CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900, + CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901, + CUDA_ERROR_STREAM_CAPTURE_MERGE = 902, + CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903, + CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904, + CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905, + CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906, + CUDA_ERROR_CAPTURED_EVENT = 907, + CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908, + CUDA_ERROR_TIMEOUT = 909, + CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910, CUDA_ERROR_UNKNOWN = 999 } CUresult; - typedef enum CUdevice_P2PAttribute_enum { CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01, CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02, - CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03 + CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03, + CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED = 0x04, + CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED = 0x04 } CUdevice_P2PAttribute; typedef void ( *CUstreamCallback)(CUstream hStream, CUresult status, void* userData); - typedef size_t ( *CUoccupancyB2DSize)(int blockSize); typedef struct CUDA_MEMCPY2D_st { size_t srcXInBytes; size_t srcY; - CUmemorytype srcMemoryType; const void* srcHost; CUdeviceptr srcDevice; CUarray srcArray; size_t srcPitch; - size_t dstXInBytes; size_t dstY; - CUmemorytype dstMemoryType; void* dstHost; CUdeviceptr dstDevice; CUarray dstArray; size_t dstPitch; - size_t WidthInBytes; size_t Height; } CUDA_MEMCPY2D; - typedef struct CUDA_MEMCPY3D_st { size_t srcXInBytes; size_t srcY; @@ -1261,7 +1121,6 @@ typedef struct CUDA_MEMCPY3D_st { void* reserved0; size_t srcPitch; size_t srcHeight; - size_t dstXInBytes; size_t dstY; size_t dstZ; @@ -1273,12 +1132,10 @@ typedef struct CUDA_MEMCPY3D_st { void* reserved1; size_t dstPitch; size_t dstHeight; - size_t WidthInBytes; size_t Height; size_t Depth; } CUDA_MEMCPY3D; - typedef struct CUDA_MEMCPY3D_PEER_st { size_t srcXInBytes; size_t srcY; @@ -1291,7 +1148,6 @@ typedef struct CUDA_MEMCPY3D_PEER_st { CUcontext srcContext; size_t srcPitch; size_t srcHeight; - size_t dstXInBytes; size_t dstY; size_t dstZ; @@ -1303,32 +1159,26 @@ typedef struct CUDA_MEMCPY3D_PEER_st { 
CUcontext dstContext; size_t dstPitch; size_t dstHeight; - size_t WidthInBytes; size_t Height; size_t Depth; } CUDA_MEMCPY3D_PEER; - typedef struct CUDA_ARRAY_DESCRIPTOR_st { size_t Width; size_t Height; - CUarray_format Format; unsigned int NumChannels; } CUDA_ARRAY_DESCRIPTOR; - typedef struct CUDA_ARRAY3D_DESCRIPTOR_st { size_t Width; size_t Height; size_t Depth; - CUarray_format Format; unsigned int NumChannels; unsigned int Flags; } CUDA_ARRAY3D_DESCRIPTOR; typedef struct CUDA_RESOURCE_DESC_st { CUresourcetype resType; - union { struct { CUarray hArray; @@ -1354,10 +1204,8 @@ typedef struct CUDA_RESOURCE_DESC_st { int reserved[32]; } reserved; } res; - unsigned int flags; } CUDA_RESOURCE_DESC; - typedef struct CUDA_TEXTURE_DESC_st { CUaddress_mode addressMode[3]; CUfilter_mode filterMode; @@ -1370,7 +1218,6 @@ typedef struct CUDA_TEXTURE_DESC_st { float borderColor[4]; int reserved[12]; } CUDA_TEXTURE_DESC; - typedef enum CUresourceViewFormat_enum { CU_RES_VIEW_FORMAT_NONE = 0x00, CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, @@ -1408,7 +1255,6 @@ typedef enum CUresourceViewFormat_enum { CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 } CUresourceViewFormat; - typedef struct CUDA_RESOURCE_VIEW_DESC_st { CUresourceViewFormat format; size_t width; @@ -1420,11 +1266,175 @@ typedef struct CUDA_RESOURCE_VIEW_DESC_st { unsigned int lastLayer; unsigned int reserved[16]; } CUDA_RESOURCE_VIEW_DESC; - typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st { unsigned long long p2pToken; unsigned int vaSpaceToken; } CUDA_POINTER_ATTRIBUTE_P2P_TOKENS; +typedef struct CUDA_LAUNCH_PARAMS_st { + CUfunction function; + unsigned int gridDimX; + unsigned int gridDimY; + unsigned int gridDimZ; + unsigned int blockDimX; + unsigned int blockDimY; + unsigned int blockDimZ; + unsigned int sharedMemBytes; + CUstream hStream; + void** kernelParams; +} CUDA_LAUNCH_PARAMS; +typedef enum CUexternalMemoryHandleType_enum { + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 +} CUexternalMemoryHandleType; +typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { + CUexternalMemoryHandleType type; + union { + int fd; + struct { + void* handle; + const void* name; + } win32; + const void* nvSciBufObject; + } handle; + unsigned long long size; + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_MEMORY_HANDLE_DESC; +typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { + unsigned long long offset; + unsigned long long size; + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_MEMORY_BUFFER_DESC; +typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { + unsigned long long offset; + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + unsigned int numLevels; + unsigned int reserved[16]; +} CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; +typedef enum CUexternalSemaphoreHandleType_enum { + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE = 5, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC = 6, + 
CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX = 7, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8 +} CUexternalSemaphoreHandleType; +typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { + CUexternalSemaphoreHandleType type; + union { + int fd; + struct { + void* handle; + const void* name; + } win32; + const void* nvSciSyncObj; + } handle; + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; +typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st { + struct { + struct { + unsigned long long value; + } fence; + union { + void* fence; + unsigned long long reserved; + } nvSciSync; + struct { + unsigned long long key; + } keyedMutex; + unsigned int reserved[12]; + } params; + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS; +typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { + struct { + struct { + unsigned long long value; + } fence; + union { + void* fence; + unsigned long long reserved; + } nvSciSync; + struct { + unsigned long long key; + unsigned int timeoutMs; + } keyedMutex; + unsigned int reserved[10]; + } params; + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS; +typedef unsigned long long CUmemGenericAllocationHandle; +typedef enum CUmemAllocationHandleType_enum { + CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1, + CU_MEM_HANDLE_TYPE_WIN32 = 0x2, + CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4, + CU_MEM_HANDLE_TYPE_MAX = 0xFFFFFFFF +} CUmemAllocationHandleType; +typedef enum CUmemAccess_flags_enum { + CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0, + CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1, + CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3, + CU_MEM_ACCESS_FLAGS_PROT_MAX = 0xFFFFFFFF +} CUmemAccess_flags; +typedef enum CUmemLocationType_enum { + CU_MEM_LOCATION_TYPE_INVALID = 0x0, + CU_MEM_LOCATION_TYPE_DEVICE = 0x1, + CU_MEM_LOCATION_TYPE_MAX = 0xFFFFFFFF +} CUmemLocationType; +typedef enum CUmemAllocationType_enum { + CU_MEM_ALLOCATION_TYPE_INVALID = 0x0, + CU_MEM_ALLOCATION_TYPE_PINNED = 0x1, + CU_MEM_ALLOCATION_TYPE_MAX = 0xFFFFFFFF +} CUmemAllocationType; +typedef enum CUmemAllocationGranularity_flags_enum { + CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0, + CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1 +} CUmemAllocationGranularity_flags; +typedef struct CUmemLocation_st { + CUmemLocationType type; + int id; +} CUmemLocation; +typedef enum CUmemAllocationCompType_enum { + CU_MEM_ALLOCATION_COMP_NONE = 0x0, + CU_MEM_ALLOCATION_COMP_GENERIC = 0x1 +} CUmemAllocationCompType; +typedef struct CUmemAllocationProp_st { + CUmemAllocationType type; + CUmemAllocationHandleType requestedHandleTypes; + CUmemLocation location; + void* win32HandleMetaData; + struct { + unsigned char compressionType; + unsigned char gpuDirectRDMACapable; + unsigned char reserved[6]; + } allocFlags; +} CUmemAllocationProp; +typedef struct CUmemAccessDesc_st { + CUmemLocation location; + CUmemAccess_flags flags; +} CUmemAccessDesc; +typedef enum CUgraphExecUpdateResult_enum { + CU_GRAPH_EXEC_UPDATE_SUCCESS = 0x0, + CU_GRAPH_EXEC_UPDATE_ERROR = 0x1, + CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED = 0x2, + CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED = 0x3, + CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED = 0x4, + CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED = 0x5, + CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED = 0x6 +} CUgraphExecUpdateResult; CUresult cuGetErrorString(CUresult error, const char** pStr); CUresult cuGetErrorName(CUresult error, const char** pStr); CUresult cuInit(unsigned int Flags); @@ -1432,10 
+1442,12 @@ CUresult cuDriverGetVersion(int* driverVersion); CUresult cuDeviceGet(CUdevice* device, int ordinal); CUresult cuDeviceGetCount(int* count); CUresult cuDeviceGetName(char* name, int len, CUdevice dev); +CUresult cuDeviceGetUuid(CUuuid* uuid, CUdevice dev); CUresult cuDeviceTotalMem(size_t* bytes, CUdevice dev); CUresult cuDeviceGetAttribute(int* pi, CUdevice_attribute attrib, CUdevice dev); -CUresult cuDeviceGetProperties(CUdevprop* prop, CUdevice dev); -CUresult cuDeviceComputeCapability(int* major, int* minor, CUdevice dev); +CUresult cuDeviceGetNvSciSyncAttributes(void* nvSciSyncAttrList, CUdevice dev, int flags); +__attribute__((deprecated)) CUresult cuDeviceGetProperties(CUdevprop* prop, CUdevice dev); +__attribute__((deprecated)) CUresult cuDeviceComputeCapability(int* major, int* minor, CUdevice dev); CUresult cuDevicePrimaryCtxRetain(CUcontext* pctx, CUdevice dev); CUresult cuDevicePrimaryCtxRelease(CUdevice dev); CUresult cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); @@ -1458,8 +1470,9 @@ CUresult cuCtxGetSharedMemConfig(CUsharedconfig* pConfig); CUresult cuCtxSetSharedMemConfig(CUsharedconfig config); CUresult cuCtxGetApiVersion(CUcontext ctx, unsigned int* version); CUresult cuCtxGetStreamPriorityRange(int* leastPriority, int* greatestPriority); -CUresult cuCtxAttach(CUcontext* pctx, unsigned int flags); -CUresult cuCtxDetach(CUcontext ctx); +CUresult cuCtxResetPersistingL2Cache(void); +__attribute__((deprecated)) CUresult cuCtxAttach(CUcontext* pctx, unsigned int flags); +__attribute__((deprecated)) CUresult cuCtxDetach(CUcontext ctx); CUresult cuModuleLoad(CUmodule* module, const char* fname); CUresult cuModuleLoadData(CUmodule* module, const void* image); CUresult cuModuleLoadDataEx(CUmodule* module, const void* image, unsigned int numOptions, CUjit_option* options, void** optionValues); @@ -1473,10 +1486,10 @@ CUresult cuLinkCreate(unsigned int numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut); CUresult cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size, const char* name, - unsigned int numOptions, CUjit_option* options, void** optionValues); + unsigned int numOptions, CUjit_option* options, void** optionValues); CUresult cuLinkAddFile(CUlinkState state, CUjitInputType type, const char* path, - unsigned int numOptions, CUjit_option* options, void** optionValues); + unsigned int numOptions, CUjit_option* options, void** optionValues); CUresult cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut); CUresult @@ -1545,6 +1558,19 @@ CUresult cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR* pArrayDescriptor, CUarr CUresult cuMipmappedArrayCreate(CUmipmappedArray* pHandle, const CUDA_ARRAY3D_DESCRIPTOR* pMipmappedArrayDesc, unsigned int numMipmapLevels); CUresult cuMipmappedArrayGetLevel(CUarray* pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level); CUresult cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); +CUresult cuMemAddressReserve(CUdeviceptr* ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags); +CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size); +CUresult cuMemCreate(CUmemGenericAllocationHandle* handle, size_t size, const CUmemAllocationProp* prop, unsigned long long flags); +CUresult cuMemRelease(CUmemGenericAllocationHandle handle); +CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags); +CUresult cuMemUnmap(CUdeviceptr ptr, size_t size); +CUresult 
cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc* desc, size_t count); +CUresult cuMemGetAccess(unsigned long long* flags, const CUmemLocation* location, CUdeviceptr ptr); +CUresult cuMemExportToShareableHandle(void* shareableHandle, CUmemGenericAllocationHandle handle, CUmemAllocationHandleType handleType, unsigned long long flags); +CUresult cuMemImportFromShareableHandle(CUmemGenericAllocationHandle* handle, void* osHandle, CUmemAllocationHandleType shHandleType); +CUresult cuMemGetAllocationGranularity(size_t* granularity, const CUmemAllocationProp* prop, CUmemAllocationGranularity_flags option); +CUresult cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp* prop, CUmemGenericAllocationHandle handle); +CUresult cuMemRetainAllocationHandle(CUmemGenericAllocationHandle* handle, void* addr); CUresult cuPointerGetAttribute(void* data, CUpointer_attribute attribute, CUdeviceptr ptr); CUresult cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); CUresult cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device); @@ -1556,22 +1582,44 @@ CUresult cuStreamCreate(CUstream* phStream, unsigned int Flags); CUresult cuStreamCreateWithPriority(CUstream* phStream, unsigned int flags, int priority); CUresult cuStreamGetPriority(CUstream hStream, int* priority); CUresult cuStreamGetFlags(CUstream hStream, unsigned int* flags); +CUresult cuStreamGetCtx(CUstream hStream, CUcontext* pctx); CUresult cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); CUresult cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void* userData, unsigned int flags); +CUresult cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode); +CUresult cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode* mode); +CUresult cuStreamEndCapture(CUstream hStream, CUgraph* phGraph); +CUresult cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus* captureStatus); +CUresult cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus* captureStatus, cuuint64_t* id); CUresult cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); CUresult cuStreamQuery(CUstream hStream); CUresult cuStreamSynchronize(CUstream hStream); CUresult cuStreamDestroy(CUstream hStream); +CUresult cuStreamCopyAttributes(CUstream dst, CUstream src); +CUresult cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, + CUstreamAttrValue* value_out); +CUresult cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, + const CUstreamAttrValue* value); CUresult cuEventCreate(CUevent* phEvent, unsigned int Flags); CUresult cuEventRecord(CUevent hEvent, CUstream hStream); CUresult cuEventQuery(CUevent hEvent); CUresult cuEventSynchronize(CUevent hEvent); CUresult cuEventDestroy(CUevent hEvent); CUresult cuEventElapsedTime(float* pMilliseconds, CUevent hStart, CUevent hEnd); +CUresult cuImportExternalMemory(CUexternalMemory* extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC* memHandleDesc); +CUresult cuExternalMemoryGetMappedBuffer(CUdeviceptr* devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC* bufferDesc); +CUresult cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray* mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC* mipmapDesc); +CUresult cuDestroyExternalMemory(CUexternalMemory extMem); +CUresult cuImportExternalSemaphore(CUexternalSemaphore* extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC* semHandleDesc); +CUresult 
cuSignalExternalSemaphoresAsync(const CUexternalSemaphore* extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* paramsArray, unsigned int numExtSems, CUstream stream); +CUresult cuWaitExternalSemaphoresAsync(const CUexternalSemaphore* extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* paramsArray, unsigned int numExtSems, CUstream stream); +CUresult cuDestroyExternalSemaphore(CUexternalSemaphore extSem); CUresult cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); +CUresult cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); CUresult cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); +CUresult cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); CUresult cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams* paramArray, unsigned int flags); CUresult cuFuncGetAttribute(int* pi, CUfunction_attribute attrib, CUfunction hfunc); +CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value); CUresult cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); CUresult cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config); CUresult cuLaunchKernel(CUfunction f, @@ -1585,49 +1633,103 @@ CUresult cuLaunchKernel(CUfunction f, CUstream hStream, void** kernelParams, void** extra); -CUresult cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); -CUresult cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); -CUresult cuParamSetSize(CUfunction hfunc, unsigned int numbytes); -CUresult cuParamSeti(CUfunction hfunc, int offset, unsigned int value); -CUresult cuParamSetf(CUfunction hfunc, int offset, float value); -CUresult cuParamSetv(CUfunction hfunc, int offset, void* ptr, unsigned int numbytes); -CUresult cuLaunch(CUfunction f); -CUresult cuLaunchGrid(CUfunction f, int grid_width, int grid_height); -CUresult cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream); -CUresult cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); +CUresult cuLaunchCooperativeKernel(CUfunction f, + unsigned int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void** kernelParams); +CUresult cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS* launchParamsList, unsigned int numDevices, unsigned int flags); +CUresult cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void* userData); +__attribute__((deprecated)) CUresult cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); +__attribute__((deprecated)) CUresult cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); +__attribute__((deprecated)) CUresult cuParamSetSize(CUfunction hfunc, unsigned int numbytes); +__attribute__((deprecated)) CUresult cuParamSeti(CUfunction hfunc, int offset, unsigned int value); +__attribute__((deprecated)) CUresult cuParamSetf(CUfunction hfunc, int offset, float value); +__attribute__((deprecated)) CUresult cuParamSetv(CUfunction hfunc, int offset, void* ptr, unsigned int numbytes); +__attribute__((deprecated)) CUresult cuLaunch(CUfunction f); +__attribute__((deprecated)) CUresult cuLaunchGrid(CUfunction f, int grid_width, int grid_height); +__attribute__((deprecated)) CUresult cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream); +__attribute__((deprecated)) CUresult 
cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); +CUresult cuGraphCreate(CUgraph* phGraph, unsigned int flags); +CUresult cuGraphAddKernelNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS* nodeParams); +CUresult cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS* nodeParams); +CUresult cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS* nodeParams); +CUresult cuGraphAddMemcpyNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_MEMCPY3D* copyParams, CUcontext ctx); +CUresult cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D* nodeParams); +CUresult cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D* nodeParams); +CUresult cuGraphAddMemsetNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS* memsetParams, CUcontext ctx); +CUresult cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS* nodeParams); +CUresult cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS* nodeParams); +CUresult cuGraphAddHostNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS* nodeParams); +CUresult cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS* nodeParams); +CUresult cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS* nodeParams); +CUresult cuGraphAddChildGraphNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies, CUgraph childGraph); +CUresult cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph* phGraph); +CUresult cuGraphAddEmptyNode(CUgraphNode* phGraphNode, CUgraph hGraph, const CUgraphNode* dependencies, size_t numDependencies); +CUresult cuGraphClone(CUgraph* phGraphClone, CUgraph originalGraph); +CUresult cuGraphNodeFindInClone(CUgraphNode* phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph); +CUresult cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType* type); +CUresult cuGraphGetNodes(CUgraph hGraph, CUgraphNode* nodes, size_t* numNodes); +CUresult cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode* rootNodes, size_t* numRootNodes); +CUresult cuGraphGetEdges(CUgraph hGraph, CUgraphNode* from, CUgraphNode* to, size_t* numEdges); +CUresult cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode* dependencies, size_t* numDependencies); +CUresult cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode* dependentNodes, size_t* numDependentNodes); +CUresult cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode* from, const CUgraphNode* to, size_t numDependencies); +CUresult cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode* from, const CUgraphNode* to, size_t numDependencies); +CUresult cuGraphDestroyNode(CUgraphNode hNode); +CUresult cuGraphInstantiate(CUgraphExec* phGraphExec, CUgraph hGraph, CUgraphNode* phErrorNode, char* logBuffer, size_t bufferSize); +CUresult cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS* nodeParams); +CUresult cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D* copyParams, CUcontext ctx); +CUresult cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS* memsetParams, CUcontext ctx); +CUresult 
cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS* nodeParams); +CUresult cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream); +CUresult cuGraphExecDestroy(CUgraphExec hGraphExec); +CUresult cuGraphDestroy(CUgraph hGraph); +CUresult cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode* hErrorNode_out, CUgraphExecUpdateResult* updateResult_out); +CUresult cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src); +CUresult cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, + CUkernelNodeAttrValue* value_out); +CUresult cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, + const CUkernelNodeAttrValue* value); CUresult cuOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize); CUresult cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags); CUresult cuOccupancyMaxPotentialBlockSize(int* minGridSize, int* blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit); CUresult cuOccupancyMaxPotentialBlockSizeWithFlags(int* minGridSize, int* blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags); -CUresult cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags); -CUresult cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags); -CUresult cuTexRefSetAddress(size_t* ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes); -CUresult cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR* desc, CUdeviceptr dptr, size_t Pitch); -CUresult cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents); -CUresult cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am); -CUresult cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm); -CUresult cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm); -CUresult cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias); -CUresult cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp); -CUresult cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso); -CUresult cuTexRefSetBorderColor(CUtexref hTexRef, float* pBorderColor); -CUresult cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags); -CUresult cuTexRefGetAddress(CUdeviceptr* pdptr, CUtexref hTexRef); -CUresult cuTexRefGetArray(CUarray* phArray, CUtexref hTexRef); -CUresult cuTexRefGetMipmappedArray(CUmipmappedArray* phMipmappedArray, CUtexref hTexRef); -CUresult cuTexRefGetAddressMode(CUaddress_mode* pam, CUtexref hTexRef, int dim); -CUresult cuTexRefGetFilterMode(CUfilter_mode* pfm, CUtexref hTexRef); -CUresult cuTexRefGetFormat(CUarray_format* pFormat, int* pNumChannels, CUtexref hTexRef); -CUresult cuTexRefGetMipmapFilterMode(CUfilter_mode* pfm, CUtexref hTexRef); -CUresult cuTexRefGetMipmapLevelBias(float* pbias, CUtexref hTexRef); -CUresult cuTexRefGetMipmapLevelClamp(float* pminMipmapLevelClamp, float* pmaxMipmapLevelClamp, CUtexref hTexRef); -CUresult cuTexRefGetMaxAnisotropy(int* pmaxAniso, CUtexref hTexRef); -CUresult cuTexRefGetBorderColor(float* pBorderColor, CUtexref hTexRef); -CUresult cuTexRefGetFlags(unsigned int* pFlags, CUtexref hTexRef); -CUresult cuTexRefCreate(CUtexref* pTexRef); -CUresult 
cuTexRefDestroy(CUtexref hTexRef); -CUresult cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags); -CUresult cuSurfRefGetArray(CUarray* phArray, CUsurfref hSurfRef); +CUresult cuOccupancyAvailableDynamicSMemPerBlock(size_t* dynamicSmemSize, CUfunction func, int numBlocks, int blockSize); +__attribute__((deprecated)) CUresult cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags); +__attribute__((deprecated)) CUresult cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags); +__attribute__((deprecated)) CUresult cuTexRefSetAddress(size_t* ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes); +__attribute__((deprecated)) CUresult cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR* desc, CUdeviceptr dptr, size_t Pitch); +__attribute__((deprecated)) CUresult cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents); +__attribute__((deprecated)) CUresult cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am); +__attribute__((deprecated)) CUresult cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm); +__attribute__((deprecated)) CUresult cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm); +__attribute__((deprecated)) CUresult cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias); +__attribute__((deprecated)) CUresult cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp); +__attribute__((deprecated)) CUresult cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso); +__attribute__((deprecated)) CUresult cuTexRefSetBorderColor(CUtexref hTexRef, float* pBorderColor); +__attribute__((deprecated)) CUresult cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags); +__attribute__((deprecated)) CUresult cuTexRefGetAddress(CUdeviceptr* pdptr, CUtexref hTexRef); +__attribute__((deprecated)) CUresult cuTexRefGetArray(CUarray* phArray, CUtexref hTexRef); +__attribute__((deprecated)) CUresult cuTexRefGetMipmappedArray(CUmipmappedArray* phMipmappedArray, CUtexref hTexRef); +__attribute__((deprecated)) CUresult cuTexRefGetAddressMode(CUaddress_mode* pam, CUtexref hTexRef, int dim); +__attribute__((deprecated)) CUresult cuTexRefGetFilterMode(CUfilter_mode* pfm, CUtexref hTexRef); +__attribute__((deprecated)) CUresult cuTexRefGetFormat(CUarray_format* pFormat, int* pNumChannels, CUtexref hTexRef); +__attribute__((deprecated)) CUresult cuTexRefGetMipmapFilterMode(CUfilter_mode* pfm, CUtexref hTexRef); +__attribute__((deprecated)) CUresult cuTexRefGetMipmapLevelBias(float* pbias, CUtexref hTexRef); +__attribute__((deprecated)) CUresult cuTexRefGetMipmapLevelClamp(float* pminMipmapLevelClamp, float* pmaxMipmapLevelClamp, CUtexref hTexRef); +__attribute__((deprecated)) CUresult cuTexRefGetMaxAnisotropy(int* pmaxAniso, CUtexref hTexRef); +__attribute__((deprecated)) CUresult cuTexRefGetBorderColor(float* pBorderColor, CUtexref hTexRef); +__attribute__((deprecated)) CUresult cuTexRefGetFlags(unsigned int* pFlags, CUtexref hTexRef); +__attribute__((deprecated)) CUresult cuTexRefCreate(CUtexref* pTexRef); +__attribute__((deprecated)) CUresult cuTexRefDestroy(CUtexref hTexRef); +__attribute__((deprecated)) CUresult cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags); +__attribute__((deprecated)) CUresult cuSurfRefGetArray(CUarray* phArray, CUsurfref hSurfRef); CUresult cuTexObjectCreate(CUtexObject* pTexObject, const CUDA_RESOURCE_DESC* pResDesc, const CUDA_TEXTURE_DESC* pTexDesc, const 
CUDA_RESOURCE_VIEW_DESC* pResViewDesc);
 CUresult cuTexObjectDestroy(CUtexObject texObject);
 CUresult cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC* pResDesc, CUtexObject texObject);
@@ -1637,9 +1739,9 @@ CUresult cuSurfObjectCreate(CUsurfObject* pSurfObject, const CUDA_RESOURCE_DESC*
 CUresult cuSurfObjectDestroy(CUsurfObject surfObject);
 CUresult cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC* pResDesc, CUsurfObject surfObject);
 CUresult cuDeviceCanAccessPeer(int* canAccessPeer, CUdevice dev, CUdevice peerDev);
-CUresult cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice);
 CUresult cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags);
 CUresult cuCtxDisablePeerAccess(CUcontext peerContext);
+CUresult cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice);
 CUresult cuGraphicsUnregisterResource(CUgraphicsResource resource);
 CUresult cuGraphicsSubResourceGetMappedArray(CUarray* pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
 CUresult cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray* pMipmappedArray, CUgraphicsResource resource);
@@ -1647,5 +1749,5 @@ CUresult cuGraphicsResourceGetMappedPointer(CUdeviceptr* pDevPtr, size_t* pSize,
 CUresult cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
 CUresult cuGraphicsMapResources(unsigned int count, CUgraphicsResource* resources, CUstream hStream);
 CUresult cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource* resources, CUstream hStream);
-
 CUresult cuGetExportTable(const void** ppExportTable, const CUuuid* pExportTableId);
+CUresult cuFuncGetModule(CUmodule* hmod, CUfunction hfunc);
diff --git a/cmd/genlib/errors.go b/cmd/genlib/errors.go
new file mode 100644
index 0000000..beb61b8
--- /dev/null
+++ b/cmd/genlib/errors.go
@@ -0,0 +1,5 @@
+package main
+
+// errors for reporting. Use global variables for all the things!
+
+var errs = make(map[string]struct{})
diff --git a/cmd/genlib/generateEnums.go b/cmd/genlib/generateEnums.go
new file mode 100644
index 0000000..d61e2b7
--- /dev/null
+++ b/cmd/genlib/generateEnums.go
@@ -0,0 +1,74 @@
+package main
+
+import (
+	"fmt"
+	"io"
+	"strings"
+
+	"github.com/gorgonia/bindgen"
+	"modernc.org/cc"
+)
+
+// genCUresult represents a list of enums we want to generate
+var genCUresult = map[bindgen.TypeKey]struct{}{
+	{Kind: cc.Enum, Name: "CUresult"}: {},
+}
+
+var cuResultMappings = map[bindgen.TypeKey]string{
+	{Kind: cc.Enum, Name: "CUresult"}: "cuResult",
+}
+
+func goRenameCUResult(a string) string {
+	a = strings.TrimPrefix(a, "CUDA_")
+	a = strings.TrimPrefix(a, "ERROR_")
+	splits := strings.Split(a, "_")
+	for i, s := range splits {
+		splits[i] = strings.Title(strings.ToLower(s))
+	}
+	return strings.Join(splits, "")
+}
+
+func generateResultEnums(f io.Writer) {
+	t, err := bindgen.Parse(bindgen.Model(), "cuda.h")
+	if err != nil {
+		panic(err)
+	}
+
+	enums := func(decl *cc.Declarator) bool {
+		name := bindgen.NameOf(decl)
+		kind := decl.Type.Kind()
+		tk := bindgen.TypeKey{Kind: kind, Name: name}
+		if _, ok := genCUresult[tk]; ok {
+			return true
+		}
+		return false
+	}
+	decls, err := bindgen.Get(t, enums)
+	if err != nil {
+		panic(err)
+	}
+
+	var m []string
+	for _, d := range decls {
+		e := d.(*bindgen.Enum)
+		tk := bindgen.TypeKey{Kind: cc.Enum, Name: e.Name}
+		fmt.Fprintf(f, "type %v int\nconst (\n", cuResultMappings[tk])
+
+		// then write the const definitions:
+		// const(...)
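+		// For example (assuming the CUresult enum from the cuda.h shown
+		// above), goRenameCUResult("CUDA_SUCCESS") yields "Success" and
+		// goRenameCUResult("CUDA_ERROR_NOT_INITIALIZED") yields
+		// "NotInitialized", so this loop emits const lines such as:
+		//
+		//	Success cuResult = C.CUDA_SUCCESS
+		//	NotInitialized cuResult = C.CUDA_ERROR_NOT_INITIALIZED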
+ + for _, a := range e.Type.EnumeratorList() { + enumName := string(a.DefTok.S()) + goName := goRenameCUResult(enumName) + m = append(m, goName) + fmt.Fprintf(f, "%v %v = C.%v\n", goName, cuResultMappings[tk], enumName) + } + f.Write([]byte(")\n")) + } + fmt.Fprintf(f, "var resString = map[cuResult]string{\n") + for _, s := range m { + fmt.Fprintf(f, "%v: %q,\n", s, s) + } + f.Write([]byte("}\n")) + +} diff --git a/cmd/genlib/main.go b/cmd/genlib/main.go index 47ba564..e1e0efe 100644 --- a/cmd/genlib/main.go +++ b/cmd/genlib/main.go @@ -1,6 +1,7 @@ package main import ( + "io/ioutil" "log" "os" "os/exec" @@ -10,60 +11,116 @@ import ( var pkgloc string var apiFile string var ctxFile string +var resultFile string func init() { gopath := os.Getenv("GOPATH") pkgloc = path.Join(gopath, "src/gorgonia.org/cu") apiFile = path.Join(pkgloc, "api.go") ctxFile = path.Join(pkgloc, "ctx_api.go") + resultFile = path.Join(pkgloc, "result.go") } func generateAPIFile(gss []*GoSignature) { + var original []byte + if _, err := os.Stat(apiFile); err == nil { + if original, err = ioutil.ReadFile(apiFile); err != nil { + panic(err) + } + } + f, err := os.Create(apiFile) if err != nil { panic(err) } defer f.Close() + defer func(original []byte, f *os.File) { + if r := recover(); r != nil { + f.Truncate(0) + f.Seek(0, 0) + f.Write(original) + log.Printf("NO CHANGES MADE TO %v. Generating API errored with %v", apiFile, r) + var errfmt string + for k := range errs { + errfmt += "\n" + errfmt += k + } + log.Printf("Errors:%v", errfmt) + + } + }(original, f) + f.WriteString(header) generateAPI(f, gss) } func generateContextFile(gss []*GoSignature) { + var original []byte + if _, err := os.Stat(ctxFile); err == nil { + if original, err = ioutil.ReadFile(ctxFile); err != nil { + panic(err) + } + } + g, err := os.Create(ctxFile) if err != nil { panic(err) } defer g.Close() + defer func(original []byte, f *os.File) { + if r := recover(); r != nil { + f.Truncate(0) + f.Seek(0, 0) + f.Write(original) + log.Printf("NO CHANGES MADE TO %v. 
Generating Context errored with\n%v", ctxFile, r) + } + }(original, g) g.WriteString(header) generateContextAPI(g, gss) } +func generateResultFile() { + g, err := os.Create(resultFile) + if err != nil { + panic(err) + } + defer g.Close() + + g.WriteString(resultHeader) + generateResultEnums(g) +} + func main() { // input := strings.NewReader(src) // sigs := Parse(input) sigs := Parse() + sigs = filterCSigs(sigs) + // fmt.Printf("Sigs\n%v", sigs) var gss []*GoSignature - sigs = filterCSigs(sigs) + for _, sig := range sigs { gs := sig.GoSig() gss = append(gss, gs) } + //generateResultFile() generateAPIFile(gss) - generateContextFile(gss) + //generateContextFile(gss) var err error - filename := apiFile - cmd := exec.Command("goimports", "-w", filename) - if err = cmd.Run(); err != nil { - log.Fatalf("Go imports failed with %v for %q", err, filename) + files := []string{ + apiFile, + ctxFile, + resultFile, } - filename = ctxFile - cmd = exec.Command("goimports", "-w", filename) - if err = cmd.Run(); err != nil { - log.Fatalf("Go imports failed with %v for %q", err, filename) + for _, filename := range files { + cmd := exec.Command("goimports", "-w", filename) + if err = cmd.Run(); err != nil { + log.Printf("Go imports failed with %v for %q", err, filename) + } } + } diff --git a/cmd/genlib/mappings.go b/cmd/genlib/mappings.go index 2f47275..4921457 100644 --- a/cmd/genlib/mappings.go +++ b/cmd/genlib/mappings.go @@ -9,11 +9,13 @@ var ignoredFunctions = map[string]struct{}{ "cuInit": empty, "cuDriverGetVersion": empty, "cuDeviceGetName": empty, // wat? + "cuDeviceGetUuid": empty, // context stuff - "cuCtxCreate": empty, - "cuCtxDestroy": empty, - "cuDevicePrimaryCtxRetain": empty, + "cuCtxCreate": empty, + "cuCtxDestroy": empty, + "cuDevicePrimaryCtxRetain": empty, + "cuCtxResetPersistingL2Cache": empty, // pointer/memory/unified addressing stuff "cuPointerGetAttribute": empty, @@ -28,11 +30,14 @@ var ignoredFunctions = map[string]struct{}{ "cuMemGetAddressRange": empty, // dealing with voids and strings... - "cuLaunchKernel": empty, - "cuModuleLoad": empty, // dealing with strings - "cuModuleLoadData": empty, // dealing with strings - "cuModuleGetFunction": empty, // dealing with strings - "cuModuleGetGlobal": empty, // dealing with strings + "cuLaunchKernel": empty, + "cuLaunchCooperativeKernel": empty, // TODO + "cuLaunchCooperativeKernelMultiDevice": empty, // TODO - possibly never (no bandwidth) + "cuLaunchHostFunc": empty, // TODO - possibly never, given the intricacies of calling Go functions in C. + "cuModuleLoad": empty, // dealing with strings + "cuModuleLoadData": empty, // dealing with strings + "cuModuleGetFunction": empty, // dealing with strings + "cuModuleGetGlobal": empty, // dealing with strings // event stuff "cuEventCreate": empty, @@ -51,6 +56,11 @@ var ignoredFunctions = map[string]struct{}{ "cuOccupancyMaxActiveBlocksPerMultiprocessor": empty, "cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags": empty, + // graph stuff + "cuGraphCreate": empty, + "cuGraphDestroy": empty, + "cuGraphClone": empty, + /* SUPPORT PLANNED BUT NOT YET DONE */ // memory stuff "cuMemAllocHost": empty, // use C.malloc @@ -71,6 +81,9 @@ var ignoredFunctions = map[string]struct{}{ /* Not planning to support anytime soon as these require extra attention */ + // NvSciSync + "cuDeviceGetNvSciSyncAttributes": empty, // I have no idea what NvSciSync is. 
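+	// (For reference: NvSciSync is NVIDIA's cross-engine synchronization
+	// primitive, used mainly for interop on Tegra/DRIVE platforms, so it
+	// stays unbound here.)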
+ // Stream Batching "cuStreamBatchMemOp": empty, @@ -120,6 +133,7 @@ var ignoredFunctions = map[string]struct{}{ // I have no clue what this is "cuGetExportTable": empty, + "cuFuncGetModule": empty, // NOT IN DOCS // Deprecated from CUDA 8 API: "cuDeviceGetProperties": empty, @@ -138,6 +152,70 @@ var ignoredFunctions = map[string]struct{}{ "cuParamSetTexRef": empty, "cuTexRefCreate": empty, "cuTexRefDestroy": empty, + + // virtual memory stuff - TODO because I don't have time right now + "cuMemAddressFree": empty, + "cuMemAddressReserve": empty, + "cuMemCreate": empty, + "cuMemExportToShareableHandle": empty, + "cuMemGetAccess": empty, + "cuMemGetAllocationGranularity": empty, + "cuMemGetAllocationPropertiesFromHandle": empty, + "cuMemImportFromShareableHandle": empty, + "cuMemMap": empty, + "cuMemRelease": empty, + "cuMemRetainAllocationHandle": empty, + "cuMemSetAccess": empty, + "cuMemUnmap": empty, + + // External resource interop - UNSUPPORTED SO FAR + "cuDestroyExternalMemory": empty, + "cuDestroyExternalSemaphore": empty, + "cuExternalMemoryGetMappedBuffer": empty, + "cuExternalMemoryGetMappedMipmappedArray": empty, + "cuImportExternalMemory": empty, + "cuImportExternalSemaphore": empty, + "cuSignalExternalSemaphoresAsync": empty, + "cuWaitExternalSemaphoresAsync": empty, + + // TEMP TODO + "cuGraphAddChildGraphNode": empty, + "cuGraphAddDependencies": empty, + "cuGraphAddEmptyNode": empty, + "cuGraphAddHostNode": empty, + "cuGraphAddKernelNode": empty, + "cuGraphAddMemcpyNode": empty, + "cuGraphAddMemsetNode": empty, + "cuGraphChildGraphNodeGetGraph": empty, + "cuGraphDestroyNode": empty, + "cuGraphExecDestroy": empty, + "cuGraphExecHostNodeSetParams": empty, + "cuGraphExecKernelNodeSetParams": empty, + "cuGraphExecMemcpyNodeSetParams": empty, + "cuGraphExecMemsetNodeSetParams": empty, + "cuGraphExecUpdate": empty, + "cuGraphGetEdges": empty, + "cuGraphGetNodes": empty, + "cuGraphGetRootNodes": empty, + "cuGraphHostNodeGetParams": empty, + "cuGraphHostNodeSetParams": empty, + "cuGraphInstantiate": empty, + "cuGraphKernelNodeCopyAttributes": empty, + "cuGraphKernelNodeGetAttribute": empty, + "cuGraphKernelNodeGetParams": empty, + "cuGraphKernelNodeSetAttribute": empty, + "cuGraphKernelNodeSetParams": empty, + "cuGraphLaunch": empty, + "cuGraphMemcpyNodeGetParams": empty, + "cuGraphMemcpyNodeSetParams": empty, + "cuGraphMemsetNodeGetParams": empty, + "cuGraphMemsetNodeSetParams": empty, + "cuGraphNodeFindInClone": empty, + "cuGraphNodeGetDependencies": empty, + "cuGraphNodeGetDependentNodes": empty, + "cuGraphNodeGetType": empty, + "cuGraphRemoveDependencies": empty, + "cuOccupancyAvailableDynamicSMemPerBlock": empty, } var fnNameMap = map[string]string{ @@ -174,7 +252,7 @@ var fnNameMap = map[string]string{ "cuModuleGetGlobal": "Module Global", "cuModuleGetFunction": "Module Function", - "cuModuleUnload": "Unload", + "cuModuleUnload": "Module Unload", "cuMemGetInfo": "MemInfo", "cuMemAlloc": "MemAlloc", @@ -226,32 +304,44 @@ var fnNameMap = map[string]string{ "cuArrayCreate": "MakeArray", "cuArrayGetDescriptor": "Array Descriptor", - "cuArrayDestroy": "DestroyArray", + "cuArrayDestroy": "Array Destroy", "cuArray3DCreate": "Make3DArray", "cuArray3DGetDescriptor": "Array Descriptor3", - "cuStreamCreate": "MakeStream", - "cuStreamCreateWithPriority": "MakeStreamWithPriority", - "cuStreamGetPriority": "Stream Priority", - "cuStreamGetFlags": "Stream Flags", - "cuStreamWaitEvent": "Stream Wait", - "cuStreamAddCallback": "Stream AddCallback", - "cuStreamAttachMemAsync": "Stream 
AttachMemAsync", - "cuStreamQuery": "Stream Query", - "cuStreamSynchronize": "Stream Synchronize", - "cuStreamDestroy": "DestroyStream", + "cuStreamCreate": "MakeStream", + "cuStreamCreateWithPriority": "MakeStreamWithPriority", + "cuStreamGetPriority": "Stream Priority", + "cuStreamGetFlags": "Stream Flags", + "cuStreamWaitEvent": "Stream Wait", + "cuStreamAddCallback": "Stream AddCallback", + "cuStreamAttachMemAsync": "Stream AttachMemAsync", + "cuStreamQuery": "Stream Query", + "cuStreamSynchronize": "Stream Synchronize", + "cuStreamDestroy": "Stream Destroy", + "cuStreamBeginCapture": "Stream BeginCapture", + "cuStreamCopyAttributes": "Stream CopyAttributes", + "cuStreamEndCapture": "Stream EndCapture", + "cuStreamGetAttribute": "Stream Attribute", + "cuStreamGetCaptureInfo": "Stream CaptureInfo", + "cuStreamGetCtx": "Stream Context", + "cuStreamIsCapturing": "Stream IsCapturing", + "cuStreamSetAttribute": "Stream SetAttribute", + "cuStreamWaitValue64": "Stream WaitOnValue64", + "cuStreamWriteValue64": "Stream WriteValue64", + "cuThreadExchangeStreamCaptureMode": "ExchangeStreamCaptureThreads", // TODO - possibly manual write "cuEventCreate": "MakeEvent", "cuEventRecord": "Event Record", "cuEventQuery": "Event Query", "cuEventSynchronize": "Event Synchronize", - "cuEventDestroy": "DestroyEvent", + "cuEventDestroy": "Event Destroy", "cuEventElapsedTime": "Event Elapsed", // getter "cuStreamWaitValue32": "Stream WaitOnValue32", "cuStreamWriteValue32": "Stream WriteValue32", "cuStreamBatchMemOp": "Stream BatchMemOp", "cuFuncGetAttribute": "Function Attribute", + "cuFuncSetAttribute": "Function SetAttribute", "cuFuncSetCacheConfig": "Function SetCacheConfig", "cuFuncSetSharedMemConfig": "Function SetSharedMemConfig", @@ -347,7 +437,12 @@ var ctypes2GoTypes = map[string]string{ "C.CUfilter_mode": "FilterMode", "C.CUdevice_P2PAttribute": "P2PAttribute", + "C.CUgraph": "Graph", + "C.CUgraphExec": "ExecGraph", + "C.CUgraphNodeType": "Node", + "C.cuuint32_t": "uint32", + "C.cuuint64_t": "uint64", "C.uint": "uint", "C.uchar": "byte", @@ -359,9 +454,10 @@ var ctypes2GoTypes = map[string]string{ "C.void": "unsafe.Pointer", "C.void*": "*unsafe.Pointer", - "C.unsigned": "uint", - "C.unsigned char": "byte", - "C.unsigned short": "uint16", + "C.unsigned": "uint", + "C.unsigned char": "byte", + "C.unsigned short": "uint16", + "C.unsigned long long": "uint64", } var gotypesConversion = map[string]string{ @@ -384,6 +480,10 @@ var gotypesConversion = map[string]string{ "ArrayDesc": "%s.c()", "Array3Desc": "%s.c()", + "Graph": "%s.c()", + "ExecGraph": "%s.c()", + "Node": "%s.c()", + // flags, which are mostly uint in the C signature "Format": "C.CUarray_format(%s)", "FuncCacheConfig": "C.CUfunc_cache(%s)", @@ -401,6 +501,7 @@ var gotypesConversion = map[string]string{ "byte": "C.uchar(%s)", "uint16": "C.ushort(%s)", "uint32": "C.cuuint32_t(%s)", // there is only one uint32 + "uint64": "C.cuuint64_t(%s)", // there are two uint64s, but both works because C. 
"int": "C.int(%s)", "int64": "C.size_t(%s)", "float64": "C.float(%s)", // there is only one instance of float64 @@ -409,8 +510,8 @@ var gotypesConversion = map[string]string{ } var ctypesConversion = map[string]string{ - "C.CUstream": "Stream(uintptr(unsafe.Pointer(%s)))", - "C.CUevent": "Event(uintptr(unsafe.Pointer(%s)))", + //"C.CUstream": "Stream(uintptr(unsafe.Pointer(%s)))", + //"C.CUevent": "Event(uintptr(unsafe.Pointer(%s)))", "C.CUDA_ARRAY_DESCRIPTOR": "goArrayDesc(&%s)", "C.CUDA_ARRAY3D_DESCRIPTOR": "goArray3Desc(&%s)", "C.CUarray": "goArray(&%s)", diff --git a/cmd/genlib/parser.go b/cmd/genlib/parser.go index 77ba3a2..18890f7 100644 --- a/cmd/genlib/parser.go +++ b/cmd/genlib/parser.go @@ -3,8 +3,8 @@ package main import ( "strings" - "github.com/cznic/cc" "github.com/gorgonia/bindgen" + "modernc.org/cc" ) func Parse() (retVal []*CSignature) { @@ -49,6 +49,7 @@ func decl2csig(d *bindgen.CSignature) *CSignature { params = append(params, bgparam2param(p)) } retVal.Params = params + retVal.Fix() return retVal } diff --git a/cmd/genlib/signature.go b/cmd/genlib/signature.go index 5707159..bdf590d 100644 --- a/cmd/genlib/signature.go +++ b/cmd/genlib/signature.go @@ -79,7 +79,8 @@ func (sig *CSignature) Fix() { func (sig *CSignature) GoSig() *GoSignature { name, ok := fnNameMap[sig.Name] if !ok { - panic(fmt.Sprintf("Name %q not found in mapping", sig.Name)) + err := fmt.Sprintf("Name %q not found in mapping", sig.Name) + errs[err] = struct{}{} } name, receiver := splitReceiver(name) @@ -99,7 +100,9 @@ func (sig *CSignature) GoSig() *GoSignature { } } if receiverParam.Name == "" { - panic(fmt.Sprintf("Not found for receiver %q", receiver)) + err := fmt.Sprintf("Receiver %q not found in signature of %v", receiver, sig.Name) + errs[err] = struct{}{} + return nil } } @@ -124,8 +127,9 @@ func (sig *CSignature) GoSig() *GoSignature { var ok bool gp.Name = p.Name if gp.Type, ok = goTypeFromCtype(p.Type); !ok { - log.Printf("p.Name %q %v", p.Name, p.Type) - panic(fmt.Sprintf("ctype %q has no Go equivalent. Signature: %v", p.Type, sig)) + err := fmt.Sprintf("ctype %q has no Go equivalent.", p.Type) + errs[err] = struct{}{} + continue } } @@ -219,7 +223,8 @@ func flagType(name string) string { default: log.Printf("Unreachable flagtype %v", name) } - panic("Unreachable") + // panic("Unreachable") + return "UNKNOWN" } func goTypeFromCtype(ct string) (string, bool) { diff --git a/cmd/genlib/templates.go b/cmd/genlib/templates.go index 39dcc2b..dad14eb 100644 --- a/cmd/genlib/templates.go +++ b/cmd/genlib/templates.go @@ -8,4 +8,34 @@ import "C" // This file was generated by the genlib program. DO NOT EDIT +` + +const resultHeader = `package cu + +//#include +import "C" +import "fmt" + +// This file was generated by the genlib program. 
diff --git a/convenience.go b/convenience.go
index 29db524..389114c 100644
--- a/convenience.go
+++ b/convenience.go
@@ -31,11 +31,6 @@ func (mem DevicePtr) MemSize() uintptr {
 	return uintptr(size)
 }
 
-// Pointer returns the pointer in form of unsafe.pointer. You shouldn't use it though, as the pointer is typically on the device
-func (mem DevicePtr) Pointer() unsafe.Pointer {
-	return unsafe.Pointer(uintptr(mem))
-}
-
 // ComputeCapability returns the compute capability of the device.
 // This method is a convenience method for the deprecated API call cuDeviceComputeCapability.
 func (d Device) ComputeCapability() (major, minor int, err error) {
diff --git a/ctx.go b/ctx.go
index 1bb2012..429fdee 100644
--- a/ctx.go
+++ b/ctx.go
@@ -174,3 +174,7 @@ func (ctx *Ctx) Run(errChan chan error) error {
 func finalizeCtx(ctx *Ctx) { ctx.Close() }
 
 /* Manually Written Methods */
+
+func (ctx *Ctx) ResetL2Cache() {
+	ctx.Do(ctx.CUContext.ResetL2Cache)
+}
diff --git a/cucontext.go b/cucontext.go
index 71b588d..6429891 100644
--- a/cucontext.go
+++ b/cucontext.go
@@ -86,3 +86,14 @@ func (d Device) RetainPrimaryCtx() (primaryContext CUContext, err error) {
 	}
 	return primaryContext, nil
 }
+
+// ResetL2Cache resets all persisting lines in cache to normal status.
+// Use only if your device supports it.
+//
+// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gb529532b5b1aef808295a6d1d18a0823
+func (ctx *CUContext) ResetL2Cache() error {
+	if err := result(C.cuCtxResetPersistingL2Cache()); err != nil {
+		return err
+	}
+	return nil
+}
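`Ctx.ResetL2Cache` rides on the context's `Do` queue, so a failed call surfaces through the context's error state rather than a return value. A hedged usage sketch (it assumes device 0 exists and relies on the package's existing `NewContext`, `Close` and `Error` helpers; persisting L2 cache lines require a device that supports them, i.e. compute capability 8.0+):

```go
package main

import (
	"fmt"
	"log"

	"gorgonia.org/cu"
)

func main() {
	// NewContext creates a *Ctx bound to device 0; its calls are queued and
	// executed on a dedicated, locked OS thread.
	ctx := cu.NewContext(cu.Device(0), cu.SchedAuto)
	defer ctx.Close()

	// ResetL2Cache enqueues cuCtxResetPersistingL2Cache. Failures (e.g. on a
	// device without a persisting L2 cache) surface through ctx.Error().
	ctx.ResetL2Cache()
	if err := ctx.Error(); err != nil {
		log.Printf("reset failed: %v", err)
		return
	}
	fmt.Println("persisting L2 cache lines reset")
}
```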
diff --git a/cucontext_test.go b/cucontext_test.go
index cd71d6d..6a0636a 100644
--- a/cucontext_test.go
+++ b/cucontext_test.go
@@ -55,8 +55,22 @@ func TestCUContext(t *testing.T) {
 	}
 
 	if maj >= 3 {
+		defaultSharedConf, err := SharedMemConfig()
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		var newBankSize SharedConfig
+
+		for _, c := range []SharedConfig{FourByteBankSize, EightByteBankSize} {
+			if c != defaultSharedConf {
+				newBankSize = c
+				break
+			}
+		}
+
 		// shared conf
-		if err := SetSharedMemConfig(EightByteBankSize); err != nil {
+		if err := SetSharedMemConfig(newBankSize); err != nil {
 			t.Fatal(err)
 		}
@@ -65,8 +79,12 @@
 			t.Fatal(err)
 		}
 
-		if sharedConf != EightByteBankSize {
-			t.Error("Expected sharedMemConf to be EightByteBankSize")
+		if sharedConf != newBankSize && sharedConf != defaultSharedConf {
+			t.Errorf("Expected sharedMemConf to be SharedConfig of %v or %v. Got %v instead", newBankSize, defaultSharedConf, sharedConf)
+		}
+
+		if sharedConf == defaultSharedConf {
+			t.Logf("The graphics card does not have configurable shared memory banks")
 		}
 
 		// cache config
diff --git a/device.go b/device.go
index cecd885..f2e6c16 100644
--- a/device.go
+++ b/device.go
@@ -5,6 +5,8 @@ import "C"
 import (
 	"fmt"
 	"unsafe"
+
+	"github.com/google/uuid"
 )
 
 // Device is the representation of a CUDA device
@@ -29,6 +31,17 @@ func (d Device) Name() (string, error) {
 	return C.GoString(cstr), nil
 }
 
+// UUID returns the UUID of the device
+//
+// Wrapper over cuDeviceGetUuid: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g987b46b884c101ed5be414ab4d9e60e4
+func (d Device) UUID() (retVal uuid.UUID, err error) {
+	ptr := &retVal
+	if err = result(C.cuDeviceGetUuid((*C.CUuuid)(unsafe.Pointer(ptr)), C.CUdevice(d))); err != nil {
+		return retVal, err
+	}
+	return retVal, nil
+}
+
 // String implements fmt.Stringer (and runtime.stringer)
 func (d Device) String() string {
 	if d == CPU {
diff --git a/device_test.go b/device_test.go
index 6a8e3f7..834d598 100644
--- a/device_test.go
+++ b/device_test.go
@@ -47,11 +47,16 @@ func TestDevice(t *testing.T) {
 		if err != nil {
 			t.Fatal(err)
 		}
+		uuid, err := d.UUID()
+		if err != nil {
+			t.Fatal(err)
+		}
 
 		fmt.Fprintf(buf, "Device %d\n========\nName :\t%q\n", d, name)
 		fmt.Fprintf(buf, "Clock Rate:\t%v kHz\n", cr)
 		fmt.Fprintf(buf, "Memory :\t%v bytes\n", mem)
-		fmt.Fprintf(buf, "Compute : \t%d.%d\n", maj, min)
+		fmt.Fprintf(buf, "Compute :\t%d.%d\n", maj, min)
+		fmt.Fprintf(buf, "UUID :\t%v\n", uuid)
 
 		t.Log(buf.String())
 		buf.Reset()
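Querying the new UUID accessor from user code follows the same shape as the test above. A sketch under stated assumptions: it presumes `cu.Init` and `cu.NumDevices` (part of the package's existing surface) and at least one CUDA device:

```go
package main

import (
	"fmt"
	"log"

	"gorgonia.org/cu"
)

func main() {
	// cuInit must run before any other driver call.
	if err := cu.Init(0); err != nil {
		log.Fatal(err)
	}
	n, err := cu.NumDevices()
	if err != nil {
		log.Fatal(err)
	}
	for i := 0; i < n; i++ {
		d := cu.Device(i)
		id, err := d.UUID() // wraps cuDeviceGetUuid, added in this patch
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("Device %d: %v\n", i, id)
	}
}
```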
diff --git a/dnn/INCOMPLETES_REPORT.md b/dnn/INCOMPLETES_REPORT.md
index 6b76c61..1531daa 100644
--- a/dnn/INCOMPLETES_REPORT.md
+++ b/dnn/INCOMPLETES_REPORT.md
@@ -1,102 +1,275 @@
 ## Potential Nils ##
 These functions have a `*T` return value, but might return a nil pointer
-* `DeriveBNTensorDescriptor`
+* `NewCTCLoss`
 * `NewDropoutWithContext`
 
 ## Unconverted C Functions ##
+* `cudnnAdvInferVersionCheck`
+* `cudnnAdvTrainVersionCheck`
+* `cudnnBackendExecute`
+* `cudnnBackendFinalize`
+* `cudnnBackendGetAttribute`
+* `cudnnBackendInitialize`
+* `cudnnBatchNormalizationBackwardEx`
+* `cudnnBatchNormalizationForwardTrainingEx`
+* `cudnnBuildRNNDynamic`
+* `cudnnCTCLoss_v8`
+* `cudnnCnnInferVersionCheck`
+* `cudnnCnnTrainVersionCheck`
 * `cudnnCopyAlgorithmDescriptor`
-* `cudnnCreateAlgorithmDescriptor`
-* `cudnnCreateAlgorithmPerformance`
+* `cudnnCreateCTCLossDescriptor`
 * `cudnnCreateConvolutionDescriptor`
-* `cudnnDestroyAlgorithmDescriptor`
-* `cudnnDestroyAlgorithmPerformance`
-* `cudnnFindConvolutionBackwardDataAlgorithm`
-* `cudnnFindConvolutionBackwardDataAlgorithmEx`
-* `cudnnFindConvolutionBackwardFilterAlgorithm`
-* `cudnnFindConvolutionBackwardFilterAlgorithmEx`
-* `cudnnFindConvolutionForwardAlgorithm`
-* `cudnnFindConvolutionForwardAlgorithmEx`
+* `cudnnCreateFusedOpsPlan`
+* `cudnnCreateRNNDescriptor`
+* `cudnnDeriveNormTensorDescriptor`
+* `cudnnDestroyFusedOpsPlan`
 * `cudnnFindRNNBackwardDataAlgorithmEx`
 * `cudnnFindRNNBackwardWeightsAlgorithmEx`
 * `cudnnFindRNNForwardInferenceAlgorithmEx`
 * `cudnnFindRNNForwardTrainingAlgorithmEx`
+* `cudnnFusedOpsExecute`
 * `cudnnGetActivationDescriptor`
 * `cudnnGetAlgorithmDescriptor`
 * `cudnnGetAlgorithmPerformance`
 * `cudnnGetAlgorithmSpaceSize`
+* `cudnnGetAttnDescriptor`
+* `cudnnGetBatchNormalizationBackwardExWorkspaceSize`
+* `cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize`
+* `cudnnGetBatchNormalizationTrainingExReserveSpaceSize`
 * `cudnnGetCTCLossDescriptor`
+* `cudnnGetCTCLossDescriptorEx`
+* `cudnnGetCTCLossDescriptor_v8`
 * `cudnnGetCTCLossWorkspaceSize`
+* `cudnnGetCTCLossWorkspaceSize_v8`
+* `cudnnGetCallback`
 * `cudnnGetConvolution2dDescriptor`
-* `cudnnGetConvolutionBackwardDataAlgorithm`
 * `cudnnGetConvolutionBackwardDataAlgorithmMaxCount`
 * `cudnnGetConvolutionBackwardDataAlgorithm_v7`
 * `cudnnGetConvolutionBackwardDataWorkspaceSize`
-* `cudnnGetConvolutionBackwardFilterAlgorithm`
 * `cudnnGetConvolutionBackwardFilterAlgorithmMaxCount`
 * `cudnnGetConvolutionBackwardFilterAlgorithm_v7`
 * `cudnnGetConvolutionBackwardFilterWorkspaceSize`
-* `cudnnGetConvolutionForwardAlgorithm`
 * `cudnnGetConvolutionForwardAlgorithmMaxCount`
 * `cudnnGetConvolutionForwardAlgorithm_v7`
 * `cudnnGetConvolutionForwardWorkspaceSize`
 * `cudnnGetConvolutionGroupCount`
 * `cudnnGetConvolutionMathType`
 * `cudnnGetConvolutionNdDescriptor`
+* `cudnnGetConvolutionReorderType`
 * `cudnnGetCudartVersion`
 * `cudnnGetDropoutDescriptor`
 * `cudnnGetErrorString`
 * `cudnnGetFilter4dDescriptor`
 * `cudnnGetFilterNdDescriptor`
+* `cudnnGetFilterSizeInBytes`
+* `cudnnGetFoldedConvBackwardDataDescriptors`
+* `cudnnGetFusedOpsConstParamPackAttribute`
+* `cudnnGetFusedOpsVariantParamPackAttribute`
 * `cudnnGetLRNDescriptor`
+* `cudnnGetMultiHeadAttnBuffers`
+* `cudnnGetMultiHeadAttnWeights`
+* `cudnnGetNormalizationBackwardWorkspaceSize`
+* `cudnnGetNormalizationForwardTrainingWorkspaceSize`
+* `cudnnGetNormalizationTrainingReserveSpaceSize`
 * `cudnnGetOpTensorDescriptor`
 * `cudnnGetPooling2dDescriptor`
 * `cudnnGetPoolingNdDescriptor`
 * `cudnnGetProperty`
 * `cudnnGetRNNBackwardDataAlgorithmMaxCount`
 * `cudnnGetRNNBackwardWeightsAlgorithmMaxCount`
-* `cudnnGetRNNDescriptor`
+* `cudnnGetRNNBiasMode`
+* `cudnnGetRNNDataDescriptor`
+* `cudnnGetRNNDescriptor_v6`
+* `cudnnGetRNNDescriptor_v8`
 * `cudnnGetRNNForwardInferenceAlgorithmMaxCount`
 * `cudnnGetRNNForwardTrainingAlgorithmMaxCount`
-* `cudnnGetRNNLinLayerBiasParams`
-* `cudnnGetRNNLinLayerMatrixParams`
 * `cudnnGetRNNMatrixMathType`
+* `cudnnGetRNNPaddingMode`
 * `cudnnGetRNNProjectionLayers`
+* `cudnnGetRNNTempSpaceSizes`
+* `cudnnGetRNNWeightParams`
+* `cudnnGetRNNWeightSpaceSize`
 * `cudnnGetReduceTensorDescriptor`
+* `cudnnGetSeqDataDescriptor`
 * `cudnnGetStream`
 * `cudnnGetTensor4dDescriptor`
 * `cudnnGetTensorNdDescriptor`
 * `cudnnGetTensorSizeInBytes`
+* `cudnnGetTensorTransformDescriptor`
 * `cudnnGetVersion`
+* `cudnnInitTransformDest`
+* `cudnnMakeFusedOpsPlan`
+* `cudnnMultiHeadAttnBackwardData`
+* `cudnnMultiHeadAttnBackwardWeights`
+* `cudnnMultiHeadAttnForward`
+* `cudnnNormalizationBackward`
+* `cudnnNormalizationForwardInference`
+* `cudnnNormalizationForwardTraining`
+* `cudnnOpsInferVersionCheck`
+* `cudnnOpsTrainVersionCheck`
 * `cudnnQueryRuntimeError`
+* `cudnnRNNBackwardDataEx`
+* `cudnnRNNBackwardData_v8`
+* `cudnnRNNBackwardWeightsEx`
+* `cudnnRNNBackwardWeights_v8`
+* `cudnnRNNForward`
+* `cudnnRNNForwardInferenceEx`
+* `cudnnRNNForwardTrainingEx`
+* `cudnnRNNGetClip`
+* `cudnnRNNGetClip_v8`
+* `cudnnRNNSetClip`
+* `cudnnRNNSetClip_v8`
+* `cudnnReorderFilterAndBias`
 * `cudnnRestoreAlgorithm`
 * `cudnnSaveAlgorithm`
-* `cudnnSetAlgorithmDescriptor`
-* `cudnnSetAlgorithmPerformance`
+* `cudnnSetCTCLossDescriptor`
+* `cudnnSetCTCLossDescriptorEx`
+* `cudnnSetCTCLossDescriptor_v8`
+* `cudnnSetCallback`
 * `cudnnSetConvolution2dDescriptor`
 * `cudnnSetConvolutionGroupCount`
 * `cudnnSetConvolutionMathType`
 * `cudnnSetConvolutionNdDescriptor`
+* `cudnnSetConvolutionReorderType` * `cudnnSetRNNAlgorithmDescriptor` -* `cudnnSetRNNDescriptor_v5` +* `cudnnSetRNNBiasMode` * `cudnnSetRNNDescriptor_v6` +* `cudnnSetRNNDescriptor_v8` +* `cudnnSetRNNPaddingMode` * `cudnnSetRNNProjectionLayers` * `cudnnSetStream` * `cudnnSetTensor` +* `cudnnTransformFilter` +* `cudnnTransformTensorEx` ## Unconverted/Unused C Types ## -* `cudnnAlgorithmDescriptor_t` -* `cudnnAlgorithmPerformance_t` * `cudnnAlgorithm_t` +* `cudnnCallback_t` * `cudnnConvolutionBwdDataAlgoPerf_t` -* `cudnnConvolutionBwdDataPreference_t` * `cudnnConvolutionBwdFilterAlgoPerf_t` -* `cudnnConvolutionBwdFilterPreference_t` * `cudnnConvolutionFwdAlgoPerf_t` -* `cudnnConvolutionFwdPreference_t` +* `cudnnDebug_t` +* `cudnnFusedOpsPlan_t` * `cudnnRuntimeTag_t` -* `cudnnSeverity_t` * `cudnnStatus_t` + +# Build Errors/TODO +``` +# gorgonia.org/cu/dnn +./algorithm.go:9:2: type struct {} is not an expression +./generated_API.go:13:54: *Activation is not a type +./generated_API.go:34:53: *Activation is not a type +./generated_API.go:147:82: undefined: TODO +./generated_API.go:214: *Activation is not a type +./generated_API.go:308:61: cannot use _Ctype_ulong(sizeInBytes) (type _Ctype_ulong) as type *_Ctype_ulong in assignment +./generated_API.go:312:143: cannot use _Ctype_int(returnedAlgoCount) (type _Ctype_int) as type *_Ctype_int in assignment +./generated_API.go:316:203: cannot use _Ctype_int(returnedAlgoCount) (type _Ctype_int) as type *_Ctype_int in assignment +./generated_API.go:320:145: cannot use _Ctype_int(returnedAlgoCount) (type _Ctype_int) as type *_Ctype_int in assignment +./generated_API.go:324:204: cannot use _Ctype_int(returnedAlgoCount) (type _Ctype_int) as type *_Ctype_int in assignment +./generated_API.go:328:138: cannot use _Ctype_int(returnedAlgoCount) (type _Ctype_int) as type *_Ctype_int in assignment +./generated_API.go:328:138: cannot use perfResults.internal (type _Ctype_cudnnConvolutionFwdAlgo_t) as type *_Ctype_struct_cudnnConvolutionFwdAlgoPerfStruct in assignment +./generated_API.go:334:181: undefined: TODO +./generated_API.go:332:195: cannot use _Ctype_int(returnedAlgoCount) (type _Ctype_int) as type *_Ctype_int in assignment +./generated_API.go:332:195: cannot use perfResults.internal (type _Ctype_cudnnConvolutionFwdAlgo_t) as type *_Ctype_struct_cudnnConvolutionFwdAlgoPerfStruct in assignment +./generated_API.go:338:181: undefined: TODO +./generated_API.go:344:91: cannot use _Ctype_ulong(sizeInBytes) (type _Ctype_ulong) as type *_Ctype_ulong in assignment +./generated_API.go:348:118: cannot use _Ctype_ulong(sizeInBytes) (type _Ctype_ulong) as type *_Ctype_ulong in assignment +./generated_API.go:348:156: cannot use _cgo3 (type _Ctype_cudnnTensorDescriptor_t) as type *_Ctype_cudnnTensorDescriptor_t in argument to _Cfunc_cudnnGetRNNTrainingReserveSize +./generated_API.go:352:112: cannot use _Ctype_ulong(sizeInBytes) (type _Ctype_ulong) as type *_Ctype_ulong in assignment +./generated_API.go:352:150: cannot use _cgo3 (type _Ctype_cudnnTensorDescriptor_t) as type *_Ctype_cudnnTensorDescriptor_t in argument to _Cfunc_cudnnGetRNNWorkspaceSize +./generated_API.go:356:123: cannot use _Ctype_ulong(sizeInBytes) (type _Ctype_ulong) as type *_Ctype_ulong in assignment +./generated_API.go:360:125: cannot use _Ctype_ulong(sizeInBytes) (type _Ctype_ulong) as type *_Ctype_ulong in assignment +./generated_API.go:477: cannot use _cgo3 (type _Ctype_cudnnTensorDescriptor_t) as type *_Ctype_cudnnTensorDescriptor_t in argument to _Cfunc_cudnnRNNBackwardData +./generated_API.go:477: 
cannot use _cgo5 (type _Ctype_cudnnTensorDescriptor_t) as type *_Ctype_cudnnTensorDescriptor_t in argument to _Cfunc_cudnnRNNBackwardData +./generated_API.go:477: cannot use _cgo17 (type _Ctype_cudnnTensorDescriptor_t) as type *_Ctype_cudnnTensorDescriptor_t in argument to _Cfunc_cudnnRNNBackwardData +./generated_API.go:481: cannot use _cgo3 (type _Ctype_cudnnTensorDescriptor_t) as type *_Ctype_cudnnTensorDescriptor_t in argument to _Cfunc_cudnnRNNBackwardWeights +./generated_API.go:481: cannot use _cgo7 (type _Ctype_cudnnTensorDescriptor_t) as type *_Ctype_cudnnTensorDescriptor_t in argument to _Cfunc_cudnnRNNBackwardWeights +./generated_API.go:485: cannot use _cgo3 (type _Ctype_cudnnTensorDescriptor_t) as type *_Ctype_cudnnTensorDescriptor_t in argument to _Cfunc_cudnnRNNForwardInference +./generated_API.go:485: cannot use _cgo11 (type _Ctype_cudnnTensorDescriptor_t) as type *_Ctype_cudnnTensorDescriptor_t in argument to _Cfunc_cudnnRNNForwardInference +./generated_API.go:489: cannot use _cgo3 (type _Ctype_cudnnTensorDescriptor_t) as type *_Ctype_cudnnTensorDescriptor_t in argument to _Cfunc_cudnnRNNForwardTraining +./generated_API.go:489: cannot use _cgo11 (type _Ctype_cudnnTensorDescriptor_t) as type *_Ctype_cudnnTensorDescriptor_t in argument to _Cfunc_cudnnRNNForwardTraining +./generated_API.go:645:61: undefined: xDesc +./generated_API.go:649:67: cannot use _Ctype_ulong(sizeInBytes) (type _Ctype_ulong) as type *_Ctype_ulong in assignment +./generated_algorithmdescriptor.go:13:12: undefined: TODO +./generated_algorithmdescriptor.go:17:39: undefined: TODO +./generated_algorithmperformance.go:14:11: undefined: Status +./generated_algorithmperformance.go:20:68: undefined: Status +./generated_algorithmperformance.go:22:52: not enough arguments in call to _Cfunc_cudnnCreateAlgorithmPerformance + have (*_Ctype_cudnnAlgorithmPerformance_t) + want (*_Ctype_cudnnAlgorithmPerformance_t, _Ctype_int) +./generated_algorithmperformance.go:42:76: a.algoPerf undefined (type *AlgorithmPerformance has no field or method algoPerf) +./generated_algorithmperformance.go:48:41: undefined: Status +./generated_algorithmperformance.go:57:36: not enough arguments in call to _Cfunc_cudnnDestroyAlgorithmPerformance + have (_Ctype_cudnnAlgorithmPerformance_t) + want (*_Ctype_cudnnAlgorithmPerformance_t, _Ctype_int) +./generated_backend.go:25:49: not enough arguments in call to _Cfunc_cudnnBackendCreateDescriptor + have (*_Ctype_cudnnBackendDescriptor_t) + want (_Ctype_cudnnBackendDescriptorType_t, *_Ctype_cudnnBackendDescriptor_t) +./generated_enums.go:259:2: Activation redeclared in this block + previous declaration at ./generated_activation.go:10:6 +./generated_enums.go:563:2: Activation redeclared in this block + previous declaration at ./generated_enums.go:259:31 +./generated_enums.go:574:2: Activation redeclared in this block + previous declaration at ./generated_enums.go:563:24 +./generated_enums.go:575:2: AddActivation redeclared in this block + previous declaration at ./generated_enums.go:260:31 +./generated_enums.go:595:6: PointwiseMode redeclared in this block + previous declaration at ./generated_enums.go:25:63 +./generated_enums.go:598:2: Add redeclared in this block + previous declaration at ./generated_enums.go:584:20 +./generated_enums.go:599:2: Mul redeclared in this block + previous declaration at ./generated_enums.go:585:20 +./generated_enums.go:600:2: Min redeclared in this block + previous declaration at ./generated_enums.go:586:20 +./generated_enums.go:601:2: Max redeclared in this block + 
previous declaration at ./generated_enums.go:587:20 +./generated_enums.go:602:2: Sqrt redeclared in this block + previous declaration at ./generated_enums.go:588:20 +./generated_enums.go:627:2: Standard redeclared in this block + previous declaration at ./generated_enums.go:553:22 +./generated_enums.go:630:2: Count redeclared in this block + previous declaration at ./generated_enums.go:221:38 +./generated_enums.go:651:2: None redeclared in this block + previous declaration at ./generated_enums.go:501:34 +./generated_enums.go:776:2: Channel redeclared in this block + previous declaration at ./generated_enums.go:564:24 +./generated_enums.go:796:2: Add redeclared in this block + previous declaration at ./generated_enums.go:598:29 +./generated_enums_strings.go:20:2: type PointwiseMode is not an expression +./generated_enums_strings.go:195:2: cannot use Count (type RNNAlgo) as type BackendLayoutType in map key +./generated_enums_strings.go:222:2: cannot use Activation (type NormOps) as type BatchNormOps in map key +./generated_enums_strings.go:223:2: cannot use AddActivation (type NormOps) as type BatchNormOps in map key +./generated_enums_strings.go:410:2: cannot use None (type RNNClipMode) as type LossNormalizationMode in map key +./generated_enums_strings.go:446:2: cannot use Standard (type RNNAlgo) as type NormAlgo in map key +./generated_enums_strings.go:453:2: cannot use Activation (type NormOps) as type NormMode in map key +./generated_enums_strings.go:454:2: cannot use Channel (type SoftmaxMode) as type NormMode in map key +./generated_enums_strings.go:468:2: cannot use Add (type WgradMode) as type OpTensorOp in map key +./generated_enums_strings.go:469:2: cannot use Mul (type PointwiseMode) as type OpTensorOp in map key +./generated_enums_strings.go:470:2: cannot use Min (type PointwiseMode) as type OpTensorOp in map key +./generated_enums_strings.go:471:2: cannot use Max (type PointwiseMode) as type OpTensorOp in map key +./generated_enums_strings.go:472:2: cannot use Sqrt (type PointwiseMode) as type OpTensorOp in map key +./generated_fusedopconsts.go:20:54: not enough arguments in call to _Cfunc_cudnnCreateFusedOpsConstParamPack + have (*_Ctype_cudnnFusedOpsConstParamPack_t) + want (*_Ctype_cudnnFusedOpsConstParamPack_t, _Ctype_cudnnFusedOps_t) +./generated_fusedopconsts.go:24:92: param.Pointer undefined (type Memory has no field or method Pointer) +./generated_fusedopvariantparams.go:23:56: not enough arguments in call to _Cfunc_cudnnCreateFusedOpsVariantParamPack + have (*_Ctype_cudnnFusedOpsVariantParamPack_t) + want (*_Ctype_cudnnFusedOpsVariantParamPack_t, _Ctype_cudnnFusedOps_t) +./generated_fusedopvariantparams.go:27:107: ptr.Pointer undefined (type Memory has no field or method Pointer) +./generated_seqdata.go:38:97: cannot use axes.C() (type _Ctype_cudnnSeqDataAxis_t) as type *_Ctype_cudnnSeqDataAxis_t in assignment +./generated_tensortransform.go:51:77: cannot use t.internal (type _Ctype_cudnnTensorTransformDescriptor_t) as type *_Ctype_cudnnTensorTransformDescriptor_t in return argument +./optensor.go:52:24: *TensorDescriptor is not a type +./optensor.go:53:24: *TensorDescriptor is not a type +./optensor.go:54:22: *TensorDescriptor is not a type +./optensor.go:106:26: cData.Pointer undefined (type Memory has no field or method Pointer) +./optensor.go:104:33: aData.Pointer undefined (type Memory has no field or method Pointer) +./optensor.go:105:28: bData.Pointer undefined (type Memory has no field or method Pointer) +./pooling.go:95:37: *TensorDescriptor is not a type 
+./pooling.go:103:41: *TensorDescriptor is not a type
+./tensor.go:11:6: TensorDescriptor redeclared in this block
+	previous declaration at ./generated_enums.go:163:71
+```
diff --git a/dnn/algorithm.c b/dnn/algorithm.c
new file mode 100644
index 0000000..0f2b1a7
--- /dev/null
+++ b/dnn/algorithm.c
@@ -0,0 +1,34 @@
+#include <cudnn.h>
+
+/* THIS IS TEMPORARY, UNTIL I FIGURE OUT A BETTER WAY TO SET UNION VALUES IN GO */
+
+cudnnAlgorithm_t makeConvFwdAlgo(cudnnConvolutionFwdAlgo_t algo) {
+	cudnnAlgorithm_t retVal;
+	retVal.algo.convFwdAlgo = algo;
+	return retVal;
+}
+
+cudnnAlgorithm_t makeConvBwdFilterAlgo(cudnnConvolutionBwdFilterAlgo_t algo){
+	cudnnAlgorithm_t retVal;
+	retVal.algo.convBwdFilterAlgo = algo;
+	return retVal;
+}
+
+
+cudnnAlgorithm_t makeConvBwdDataAlgo(cudnnConvolutionBwdDataAlgo_t algo){
+	cudnnAlgorithm_t retVal;
+	retVal.algo.convBwdDataAlgo = algo;
+	return retVal;
+}
+
+cudnnAlgorithm_t makeRNNAlgo(cudnnRNNAlgo_t algo) {
+	cudnnAlgorithm_t retVal;
+	retVal.algo.RNNAlgo = algo;
+	return retVal;
+}
+
+cudnnAlgorithm_t makeCTCLossAlgo(cudnnCTCLossAlgo_t algo) {
+	cudnnAlgorithm_t retVal;
+	retVal.algo.CTCLossAlgo = algo;
+	return retVal;
+}
diff --git a/dnn/algorithm.h b/dnn/algorithm.h
new file mode 100644
index 0000000..674c207
--- /dev/null
+++ b/dnn/algorithm.h
@@ -0,0 +1,5 @@
+extern cudnnAlgorithm_t makeConvFwdAlgo(cudnnConvolutionFwdAlgo_t algo);
+extern cudnnAlgorithm_t makeConvBwdFilterAlgo(cudnnConvolutionBwdFilterAlgo_t algo);
+extern cudnnAlgorithm_t makeConvBwdDataAlgo(cudnnConvolutionBwdDataAlgo_t algo);
+extern cudnnAlgorithm_t makeRNNAlgo(cudnnRNNAlgo_t algo);
+extern cudnnAlgorithm_t makeCTCLossAlgo(cudnnCTCLossAlgo_t algo);
diff --git a/dnn/cgoflags.go b/dnn/cgoflags.go
index f6db7ad..67b718e 100644
--- a/dnn/cgoflags.go
+++ b/dnn/cgoflags.go
@@ -5,5 +5,10 @@ package cudnn
 //
 // // default locs:
 // #cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib
-// #cgo CFLAGS: -I/usr/include/x86_64-linux-gnu -I/usr/local/cuda-10.1/targets/x86_64-linux/include -I/usr/local/cuda/include
+//
+// // Include locations for cudnn.
+// #cgo CFLAGS: -I/usr/local/cuda-10.2/targets/x86_64-linux/include
+// #cgo CFLAGS: -I/usr/local/cuda-10.1/targets/x86_64-linux/include
+// #cgo CFLAGS: -I/usr/include/x86_64-linux-gnu
+// #cgo CFLAGS: -I/usr/local/cuda/include
 import "C"
diff --git a/dnn/convolution.go b/dnn/convolution.go
index 02b05ad..1c9bfb7 100644
--- a/dnn/convolution.go
+++ b/dnn/convolution.go
@@ -228,7 +228,7 @@ func destroyConvolution(obj *Convolution) { C.cudnnDestroyConvolutionDescriptor(
 // TODO
 type ConvolutionFwdPerf struct {
-	internal C.cudnnConvolutionFwdAlgo_t
+	internal C.cudnnConvolutionFwdAlgoPerf_t
 	Algo     ConvolutionFwdAlgo
 	Time     float64
 	Memory   uintptr // size
@@ -243,7 +243,7 @@ func convolutionFwdPerfFromC(p C.cudnnConvolutionFwdAlgo_t) *ConvolutionFwdPerf
 }

 type ConvolutionBwdPerf struct {
-	internal *C.cudnnConvolutionBwdFilterAlgoPerf_t
+	internal C.cudnnConvolutionBwdFilterAlgoPerf_t
 	Err      error

 	Algo ConvolutionBwdFilterAlgo
@@ -259,7 +259,7 @@ func convolutionBwdPerfFromC(p C.cudnnConvolutionBwdFilterAlgoPerf_t) *Convoluti
 }

 type ConvolutionBwdDataPerf struct {
-	internal *C.cudnnConvolutionBwdDataAlgoPerf_t
+	internal C.cudnnConvolutionBwdDataAlgoPerf_t
 	Algo     ConvolutionBwdDataAlgo
 	Err      error

diff --git a/dnn/cudnn.go b/dnn/cudnn.go
index 634c213..dc02b73 100644
--- a/dnn/cudnn.go
+++ b/dnn/cudnn.go
@@ -1,5 +1,6 @@
 package cudnn

+// #include <cudnn.h>
 import "C"
 import (
 	"fmt"
@@ -14,10 +15,13 @@ func init() {
 	gointsize = int(unsafe.Sizeof(int(1)))
 }

+// int32sPool is a pool of Go-accessible []int32 slices.
+// The pool is needed when setting groups of values (usually shapes).
 var int32sPool = &sync.Pool{
 	New: func() interface{} { return make([]int32, 0, 8) },
 }

+// returnManaged returns any managed slices to the pool.
 func returnManaged(a interface{}) {
 	if a == nil {
 		return
@@ -33,6 +37,10 @@ func returnManaged(a interface{}) {
 	}
 }

+// ints2CIntPtr takes a []int and returns a C pointer to the slice.
+// On architectures where the Go int and the C int sizes are different,
+// a slice of C-int-size-equivalent ints will be allocated. This is called "managed".
+// The C pointer will be to that newly allocated slice. The `managed` slice will also be returned.
 func ints2CIntPtr(a []int) (cPtr *C.int, managed interface{}) {
 	if cintsize == gointsize {
 		return (*C.int)(unsafe.Pointer(&a[0])), nil
@@ -48,3 +56,11 @@ func ints2CIntPtr(a []int) (cPtr *C.int, managed interface{}) {
 		panic(fmt.Sprintf("UNHANDLED: cintsize: %v gointsize: %v", cintsize, gointsize))
 	}
 }
+
+func int32s2CInt32Ptr(a []int32) (cPtr *C.int32_t) {
+	return (*C.int32_t)(unsafe.Pointer(&a[0]))
+}
+
+func uint32s2CUint32Ptr(a []uint32) (cPtr *C.uint32_t) {
+	return (*C.uint32_t)(unsafe.Pointer(&a[0]))
+}
diff --git a/dnn/dropout.go b/dnn/dropout.go
index ae1a3b7..617a000 100644
--- a/dnn/dropout.go
+++ b/dnn/dropout.go
@@ -55,45 +55,45 @@ func NewDropoutWithContext(dropout float64, handle *Context, states Memory, stat
 }

 // Use is the second stage of the two-stage API.
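A minimal sketch of how the two stages fit together, ahead of the diff to `Use` below. `NewDropout` is assumed here as the stage-one constructor, and `allocStates` is a hypothetical helper that returns a device-backed `Memory` of the requested size:

```
// twoStageDropout sketches the two-stage dropout API.
// allocStates is hypothetical; any allocator returning a Memory works.
func twoStageDropout(ctx *Context) (*Dropout, error) {
	dr, err := NewDropout(0.5) // stage 1: only the dropout ratio is known
	if err != nil {
		return nil, err
	}
	size, err := dr.RequiredStateSize(ctx)
	if err != nil {
		return nil, err
	}
	states, err := allocStates(size) // hypothetical device allocation
	if err != nil {
		return nil, err
	}
	// stage 2: bind the context, scratch states and seed
	if err := dr.Use(ctx, states, size, 1337); err != nil {
		return nil, err
	}
	return dr, nil // dr.IsReady() is now true
}
```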
-func (d *Dropout) Use(ctx *Context, states Memory, stateSizeInBytes uintptr, seed uint64) error {
-	d.handle = ctx
-	d.states = states
-	d.stateSizeInBytes = stateSizeInBytes
-	d.seed = seed
+func (dr *Dropout) Use(ctx *Context, states Memory, stateSizeInBytes uintptr, seed uint64) error {
+	dr.handle = ctx
+	dr.states = states
+	dr.stateSizeInBytes = stateSizeInBytes
+	dr.seed = seed

-	return result(C.cudnnSetDropoutDescriptor(d.internal, d.handle.internal, C.float(d.dropout), unsafe.Pointer(d.states.Uintptr()), C.size_t(d.stateSizeInBytes), C.ulonglong(d.seed)))
+	return result(C.cudnnSetDropoutDescriptor(dr.internal, dr.handle.internal, C.float(dr.dropout), unsafe.Pointer(dr.states.Uintptr()), C.size_t(dr.stateSizeInBytes), C.ulonglong(dr.seed)))
 }

 // IsReady indicates if the dropout operator is ready to be used
-func (d *Dropout) IsReady() bool {
-	return d.handle != nil && d.states != nil && d.stateSizeInBytes != 0
+func (dr *Dropout) IsReady() bool {
+	return dr.handle != nil && dr.states != nil && dr.stateSizeInBytes != 0
 }

 // Reset resets the state to be not ready. It does NOT reset the dropout ratio.
-func (d *Dropout) Reset() {
-	d.handle = nil
-	d.states = nil
-	d.stateSizeInBytes = 0
-	d.seed = 0
+func (dr *Dropout) Reset() {
+	dr.handle = nil
+	dr.states = nil
+	dr.stateSizeInBytes = 0
+	dr.seed = 0
 }

 // Handle returns the internal handle.
-func (d *Dropout) Handle() *Context { return d.handle }
+func (dr *Dropout) Handle() *Context { return dr.handle }

 // Dropout returns the internal dropout ratio.
-func (d *Dropout) Dropout() float32 { return d.dropout }
+func (dr *Dropout) Dropout() float32 { return dr.dropout }

 // StateSizeInBytes returns the internal stateSizeInBytes.
-func (d *Dropout) StateSizeInBytes() uintptr { return d.stateSizeInBytes }
+func (dr *Dropout) StateSizeInBytes() uintptr { return dr.stateSizeInBytes }

 // Seed returns the internal seed.
-func (d *Dropout) Seed() uint64 { return d.seed }
+func (dr *Dropout) Seed() uint64 { return dr.seed }

-func (d *Dropout) States() Memory { return d.states }
+func (dr *Dropout) States() Memory { return dr.states }

-func (d *Dropout) RequiredStateSize(ctx *Context) (uintptr, error) {
-	if d.reqStateSize > 0 {
-		return d.reqStateSize, nil
+func (dr *Dropout) RequiredStateSize(ctx *Context) (uintptr, error) {
+	if dr.reqStateSize > 0 {
+		return dr.reqStateSize, nil
 	}

 	var minSize C.size_t
@@ -101,8 +101,8 @@ func (d *Dropout) RequiredStateSize(ctx *Context) (uintptr, error) {
 		return 0, errors.Wrapf(err, "Unable to get minimum state size")
 	}

-	d.reqStateSize = uintptr(minSize)
-	return d.reqStateSize, nil
+	dr.reqStateSize = uintptr(minSize)
+	return dr.reqStateSize, nil
 }

 // BUG(anyone): the memory for the scratch space isn't freed. This could potentially lead to some issues
diff --git a/dnn/generated_API.go b/dnn/generated_API.go
index 69229bf..f131180 100644
--- a/dnn/generated_API.go
+++ b/dnn/generated_API.go
@@ -10,23 +10,33 @@ import (
 	"github.com/pkg/errors"
 )

-// RestoreDropoutDescriptor restores a dropout descriptor to a previously saved-off state.
-func (dr *Dropout) RestoreDropoutDescriptor(handle *Context, dropout float32, states Memory, stateSizeInBytes uintptr, seed uint64) error {
-	// call cudnnRestoreDropoutDescriptor
-	return result(C.cudnnRestoreDropoutDescriptor(dr.internal, handle.internal, C.float(dropout), states.Pointer(), C.size_t(stateSizeInBytes), C.ulonglong(seed)))
+// ActivationBackward computes the gradient of the activation function. Wrapper over cudnnActivationBackward.
+func (co *Context) ActivationBackward(activationDesc *Activation, alpha float64, yDesc *TensorDescriptor, y Memory, dyDesc *TensorDescriptor, dy Memory, xDesc *TensorDescriptor, x Memory, beta float64, dxDesc *TensorDescriptor, dx Memory) error {
+	// DOUBLECHECK: "cudnnActivationBackward" returns Memory type in Parameter 11
+	var alphaC, betaC unsafe.Pointer
+	switch yDesc.dataType {
+	case Float, Half:
+		var alphaF, betaF C.float
+		alphaF = C.float(float32(alpha))
+		betaF = C.float(float32(beta))
+		alphaC = unsafe.Pointer(&alphaF)
+		betaC = unsafe.Pointer(&betaF)
+	case Double:
+		var alphaF, betaF C.double
+		alphaF = C.double(alpha)
+		betaF = C.double(beta)
+		alphaC = unsafe.Pointer(&alphaF)
+		betaC = unsafe.Pointer(&betaF)
+	default:
+		return errors.Errorf("Unsupported data type: %v", yDesc.dataType)
+	}
+	// call cudnnActivationBackward
+	return result(C.cudnnActivationBackward(co.internal, activationDesc.internal, alphaC, yDesc.internal, unsafe.Pointer(y.Uintptr()), dyDesc.internal, unsafe.Pointer(dy.Uintptr()), xDesc.internal, unsafe.Pointer(x.Uintptr()), betaC, dxDesc.internal, unsafe.Pointer(dx.Uintptr())))
 }

-// // Derives a secondary tensor descriptor for BatchNormalization scale, invVariance, bnBias, bnScale subtensors from the layer's x data descriptor. Use the tensor descriptor produced by this function as the bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc parameters in Spatial and Per-Activation Batch Normalization forward and backward functions. Resulting dimensions will be 1xC(x1)x1x1 for BATCHNORM_MODE_SPATIAL and 1xC(xD)xHxW for BATCHNORM_MODE_PER_ACTIVATION (parentheses for 5D). For HALF input data type the resulting tensor descriptor will have a FLOAT type. For other data types it will have the same type as the input data.
-// func (te *TensorDescriptor) DeriveBNTensorDescriptor(mode BatchNormMode) (derivedBnDesc *TensorDescriptor, err error) {
-// 	// TODO: xDesc cudnnTensorDescriptor_t
-// 	// call cudnnDeriveBNTensorDescriptor
-// 	err = result(C.cudnnDeriveBNTensorDescriptor(te.internal, xDesc.internal, mode.C()))
-// 	return
-// }
-
-// TransformTensor copies the scaled data from one tensor to another tensor with a different layout. Those descriptors need to have the same dimensions but not necessarily the same strides. The input and output tensors must not overlap in any way (i.e., tensors cannot be transformed in place). TransformTensor can be used to convert a tensor with an unsupported format to a supported one.
-func (co *Context) TransformTensor(alpha float64, xDesc *TensorDescriptor, x Memory, beta float64, yDesc *TensorDescriptor, y Memory) error {
-	// DOUBLECHECK: "cudnnTransformTensor" returns Memory type in Parameter 6
+// ActivationForward applies the specified neuron activation function element-wise over each input value. Wrapper over cudnnActivationForward.
+func (co *Context) ActivationForward(activationDesc *Activation, alpha float64, xDesc *TensorDescriptor, x Memory, beta float64, yDesc *TensorDescriptor, y Memory) error { + // DOUBLECHECK: "cudnnActivationForward" returns Memory type in Parameter 7 var alphaC, betaC unsafe.Pointer switch xDesc.dataType { case Float, Half: @@ -44,8 +54,8 @@ func (co *Context) TransformTensor(alpha float64, xDesc *TensorDescriptor, x Mem default: return errors.Errorf("Unsupported data type: %v", xDesc.dataType) } - // call cudnnTransformTensor - return result(C.cudnnTransformTensor(co.internal, alphaC, xDesc.internal, x.Pointer(), betaC, yDesc.internal, y.Pointer())) + // call cudnnActivationForward + return result(C.cudnnActivationForward(co.internal, activationDesc.internal, alphaC, xDesc.internal, unsafe.Pointer(x.Uintptr()), betaC, yDesc.internal, unsafe.Pointer(y.Uintptr()))) } // AddTensor adds the scaled values of a bias tensor to another tensor. Each dimension of the bias tensor A must match the corresponding dimension of the destination tensor C or must be equal to 1. In the latter case, the same value from the bias tensor for those dimensions will be used to blend into the C tensor. @@ -69,61 +79,44 @@ func (co *Context) AddTensor(alpha float64, aDesc *TensorDescriptor, A Memory, b return errors.Errorf("Unsupported data type: %v", aDesc.dataType) } // call cudnnAddTensor - return result(C.cudnnAddTensor(co.internal, alphaC, aDesc.internal, A.Pointer(), betaC, cDesc.internal, C_.Pointer())) + return result(C.cudnnAddTensor(co.internal, alphaC, aDesc.internal, unsafe.Pointer(A.Uintptr()), betaC, cDesc.internal, unsafe.Pointer(C_.Uintptr()))) } -// OpTensor implements the equation C = op ( alpha1[0] * A, alpha2[0] * B ) + beta[0] * C, given tensors A, B, and C and scaling factors alpha1, alpha2, and beta. The op to use is indicated by the descriptor opTensorDesc. Currently-supported ops are listed by the OpTensorOp_t enum. -// C_ is both an input and output -func (co *Context) OpTensor(opTensorDesc *Op, alpha1 float64, aDesc *TensorDescriptor, A Memory, alpha2 float64, bDesc *TensorDescriptor, B Memory, beta float64, cDesc *TensorDescriptor, C_ Memory) error { - var alpha1C, alpha2C, betaC unsafe.Pointer - switch aDesc.dataType { +// For more information, see cudnnDeriveBNTensorDescriptor() for the secondary tensor descriptor generation for the parameters used in this function. 
+func (co *Context) BatchNormalizationBackward(mode BatchNormMode, alphaDataDiff float64, betaDataDiff float64, alphaParamDiff float64, betaParamDiff float64, xDesc *TensorDescriptor, x Memory, dyDesc *TensorDescriptor, dy Memory, dxDesc *TensorDescriptor, dx Memory, dBnScaleBiasDesc *TensorDescriptor, bnScale Memory, dBnScaleResult Memory, dBnBiasResult Memory, epsilon float64, savedMean Memory, savedInvVariance Memory) error { + var alphaDataDiffC, betaDataDiffC, alphaParamDiffC, betaParamDiffC unsafe.Pointer + switch xDesc.dataType { case Float, Half: - var alpha1F, alpha2F, betaF C.float - alpha1F = C.float(float32(alpha1)) - alpha2F = C.float(float32(alpha2)) - betaF = C.float(float32(beta)) - alpha1C = unsafe.Pointer(&alpha1F) - alpha2C = unsafe.Pointer(&alpha2F) - betaC = unsafe.Pointer(&betaF) + var alphaDataDiffF, betaDataDiffF, alphaParamDiffF, betaParamDiffF C.float + alphaDataDiffF = C.float(float32(alphaDataDiff)) + betaDataDiffF = C.float(float32(betaDataDiff)) + alphaParamDiffF = C.float(float32(alphaParamDiff)) + betaParamDiffF = C.float(float32(betaParamDiff)) + alphaDataDiffC = unsafe.Pointer(&alphaDataDiffF) + betaDataDiffC = unsafe.Pointer(&betaDataDiffF) + alphaParamDiffC = unsafe.Pointer(&alphaParamDiffF) + betaParamDiffC = unsafe.Pointer(&betaParamDiffF) case Double: - var alpha1F, alpha2F, betaF C.double - alpha1F = C.double(alpha1) - alpha2F = C.double(alpha2) - betaF = C.double(beta) - alpha1C = unsafe.Pointer(&alpha1F) - alpha2C = unsafe.Pointer(&alpha2F) - betaC = unsafe.Pointer(&betaF) + var alphaDataDiffF, betaDataDiffF, alphaParamDiffF, betaParamDiffF C.double + alphaDataDiffF = C.double(alphaDataDiff) + betaDataDiffF = C.double(betaDataDiff) + alphaParamDiffF = C.double(alphaParamDiff) + betaParamDiffF = C.double(betaParamDiff) + alphaDataDiffC = unsafe.Pointer(&alphaDataDiffF) + betaDataDiffC = unsafe.Pointer(&betaDataDiffF) + alphaParamDiffC = unsafe.Pointer(&alphaParamDiffF) + betaParamDiffC = unsafe.Pointer(&betaParamDiffF) default: - return errors.Errorf("Unsupported data type: %v", aDesc.dataType) + return errors.Errorf("Unsupported data type: %v", xDesc.dataType) } - // call cudnnOpTensor - return result(C.cudnnOpTensor(co.internal, opTensorDesc.internal, alpha1C, aDesc.internal, A.Pointer(), alpha2C, bDesc.internal, B.Pointer(), betaC, cDesc.internal, C_.Pointer())) -} - -// GetReductionIndicesSize is a helper function to return the minimum size of the index space to be passed to the reduction given the input and output tensors. -func (co *Context) GetReductionIndicesSize(reduceTensorDesc *Reduction, aDesc *TensorDescriptor, cDesc *TensorDescriptor) (sizeInBytes uintptr, err error) { - var sizeInBytesC C.size_t - // call cudnnGetReductionIndicesSize - err = result(C.cudnnGetReductionIndicesSize(co.internal, reduceTensorDesc.internal, aDesc.internal, cDesc.internal, &sizeInBytesC)) - sizeInBytes = uintptr(sizeInBytesC) - return -} - -// GetReductionWorkspaceSize is a helper function to return the minimum size of the workspace to be passed to the reduction given the input and output tensors. 
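Throughout these wrappers, device memory is now threaded through as `unsafe.Pointer(mem.Uintptr())`, so a `Memory` value only needs to expose its address and size. A minimal sketch of an implementation satisfying that contract (the `devmem` type and its fields are hypothetical):

```
// devmem is a hypothetical Memory implementation backed by a raw
// device address (e.g. obtained from cuMemAlloc) and a byte size.
type devmem struct {
	addr uintptr
	size uintptr
}

func (m devmem) Uintptr() uintptr { return m.addr }
func (m devmem) MemSize() uintptr { return m.size }
```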
-func (co *Context) GetReductionWorkspaceSize(reduceTensorDesc *Reduction, aDesc *TensorDescriptor, cDesc *TensorDescriptor) (sizeInBytes uintptr, err error) { - var sizeInBytesC C.size_t - // call cudnnGetReductionWorkspaceSize - err = result(C.cudnnGetReductionWorkspaceSize(co.internal, reduceTensorDesc.internal, aDesc.internal, cDesc.internal, &sizeInBytesC)) - sizeInBytes = uintptr(sizeInBytesC) - return + // call cudnnBatchNormalizationBackward + return result(C.cudnnBatchNormalizationBackward(co.internal, mode.C(), alphaDataDiffC, betaDataDiffC, alphaParamDiffC, betaParamDiffC, xDesc.internal, unsafe.Pointer(x.Uintptr()), dyDesc.internal, unsafe.Pointer(dy.Uintptr()), dxDesc.internal, unsafe.Pointer(dx.Uintptr()), dBnScaleBiasDesc.internal, unsafe.Pointer(bnScale.Uintptr()), unsafe.Pointer(dBnScaleResult.Uintptr()), unsafe.Pointer(dBnBiasResult.Uintptr()), C.double(epsilon), unsafe.Pointer(savedMean.Uintptr()), unsafe.Pointer(savedInvVariance.Uintptr()))) } -// ReduceTensor reduces tensor A by implementing the equation C = alpha * reduce op ( A ) + beta * C, given tensors A and C and scaling factors alpha and beta. The reduction op to use is indicated by the descriptor reduceTensorDesc. Currently-supported ops are listed by the ReduceTensorOp_t enum. -// C_ is both an input and output -func (co *Context) ReduceTensor(reduceTensorDesc *Reduction, indices Memory, indicesSizeInBytes uintptr, workspace Memory, workspaceSizeInBytes uintptr, alpha float64, aDesc *TensorDescriptor, A Memory, beta float64, cDesc *TensorDescriptor, C_ Memory) error { - // DOUBLECHECK: "cudnnReduceTensor" returns Memory type in Parameter 2 +// Input. Handle to a previously created cuDNN library descriptor. For more information, see cudnnHandle_t. +func (co *Context) BatchNormalizationForwardInference(mode BatchNormMode, alpha float64, beta float64, xDesc *TensorDescriptor, x Memory, yDesc *TensorDescriptor, y Memory, bnScaleBiasMeanVarDesc *TensorDescriptor, bnScale Memory, bnBias Memory, estimatedMean Memory, estimatedVariance Memory, epsilon float64) error { var alphaC, betaC unsafe.Pointer - switch aDesc.dataType { + switch xDesc.dataType { case Float, Half: var alphaF, betaF C.float alphaF = C.float(float32(alpha)) @@ -137,59 +130,41 @@ func (co *Context) ReduceTensor(reduceTensorDesc *Reduction, indices Memory, ind alphaC = unsafe.Pointer(&alphaF) betaC = unsafe.Pointer(&betaF) default: - return errors.Errorf("Unsupported data type: %v", aDesc.dataType) + return errors.Errorf("Unsupported data type: %v", xDesc.dataType) } - // call cudnnReduceTensor - return result(C.cudnnReduceTensor(co.internal, reduceTensorDesc.internal, indices.Pointer(), C.size_t(indicesSizeInBytes), workspace.Pointer(), C.size_t(workspaceSizeInBytes), alphaC, aDesc.internal, A.Pointer(), betaC, cDesc.internal, C_.Pointer())) + // call cudnnBatchNormalizationForwardInference + return result(C.cudnnBatchNormalizationForwardInference(co.internal, mode.C(), alphaC, betaC, xDesc.internal, unsafe.Pointer(x.Uintptr()), yDesc.internal, unsafe.Pointer(y.Uintptr()), bnScaleBiasMeanVarDesc.internal, unsafe.Pointer(bnScale.Uintptr()), unsafe.Pointer(bnBias.Uintptr()), unsafe.Pointer(estimatedMean.Uintptr()), unsafe.Pointer(estimatedVariance.Uintptr()), C.double(epsilon))) } -// ScaleTensor scale all the elements of a tensor by a given factor. 
-// y is both an input and output -func (co *Context) ScaleTensor(yDesc *TensorDescriptor, y Memory, alpha float64) error { - var alphaC unsafe.Pointer - switch yDesc.dataType { +// Handle to a previously created cuDNN library descriptor. For more information, see cudnnHandle_t. +func (co *Context) BatchNormalizationForwardTraining(mode BatchNormMode, alpha float64, beta float64, xDesc *TensorDescriptor, x Memory, yDesc *TensorDescriptor, y Memory, bnScaleBiasMeanVarDesc *TensorDescriptor, bnScale Memory, bnBias Memory, exponentialAverageFactor float64, resultRunningMean Memory, resultRunningVariance Memory, epsilon float64, resultSaveMean Memory, resultSaveInvVariance Memory) error { + // DOUBLECHECK: "cudnnBatchNormalizationForwardTraining" returns Memory type in Parameter 16 + var alphaC, betaC unsafe.Pointer + switch xDesc.dataType { case Float, Half: - var alphaF C.float + var alphaF, betaF C.float alphaF = C.float(float32(alpha)) + betaF = C.float(float32(beta)) alphaC = unsafe.Pointer(&alphaF) + betaC = unsafe.Pointer(&betaF) case Double: - var alphaF C.double + var alphaF, betaF C.double alphaF = C.double(alpha) + betaF = C.double(beta) alphaC = unsafe.Pointer(&alphaF) + betaC = unsafe.Pointer(&betaF) default: - return errors.Errorf("Unsupported data type: %v", yDesc.dataType) + return errors.Errorf("Unsupported data type: %v", xDesc.dataType) } - // call cudnnScaleTensor - return result(C.cudnnScaleTensor(co.internal, yDesc.internal, y.Pointer(), alphaC)) -} - -// // FindConvolutionForwardAlgorithm attempts all cuDNN algorithms (including CUDNN_TENSOR_OP_MATH and CUDNN_DEFAULT_MATH versions of algorithms where CUDNN_TENSOR_OP_MATH may be available) for cudnnConvolutionForward(), using memory allocated via cudaMalloc(), and outputs performance metrics to a user-allocated array of cudnnConvolutionFwdAlgoPerf_t. These metrics are written in sorted fashion where the first element has the lowest compute time. The total number of resulting algorithms can be queried through the API cudnnGetConvolutionForwardMaxCount(). -// func (co *Context) FindConvolutionForwardAlgorithm(xDesc *TensorDescriptor, wDesc *Filter, convDesc *Convolution, yDesc *TensorDescriptor, requestedAlgoCount int) (returnedAlgoCount int, perfResults *ConvolutionFwdPerf, err error) { -// var returnedAlgoCountC C.int -// perfResults = new(ConvolutionFwdPerf) -// // TODO: perfResults cudnnConvolutionFwdAlgoPerf_t -// // call cudnnFindConvolutionForwardAlgorithm -// err = result(C.cudnnFindConvolutionForwardAlgorithm(co.internal, xDesc.internal, wDesc.internal, convDesc.internal, yDesc.internal, C.int(requestedAlgoCount), &returnedAlgoCountC, perfResults.internal)) -// returnedAlgoCount = int(returnedAlgoCountC) -// return -// } - -// // FindConvolutionForwardAlgorithmEx attempts all available cuDNN algorithms (including CUDNN_TENSOR_OP_MATH and CUDNN_DEFAULT_MATH versions of algorithms where CUDNN_TENSOR_OP_MATH may be available) for cudnnConvolutionForward, using user-allocated GPU memory, and outputs performance metrics to a user-allocated array of cudnnConvolutionFwdAlgoPerf_t. These metrics are written in sorted fashion where the first element has the lowest compute time. The total number of resulting algorithms can be queried through the API cudnnGetConvolutionForwardMaxCount(). 
-// // y is both an input and output -// func (co *Context) FindConvolutionForwardAlgorithmEx(xDesc *TensorDescriptor, x Memory, wDesc *Filter, w Memory, convDesc *Convolution, yDesc *TensorDescriptor, y Memory, requestedAlgoCount int, workSpace Memory, workSpaceSizeInBytes uintptr) (returnedAlgoCount int, perfResults *ConvolutionFwdPerf, err error) { -// var returnedAlgoCountC C.int -// // TODO: perfResults cudnnConvolutionFwdAlgoPerf_t -// // call cudnnFindConvolutionForwardAlgorithmEx -// err = result(C.cudnnFindConvolutionForwardAlgorithmEx(co.internal, xDesc.internal, x.Pointer(), wDesc.internal, w.Pointer(), convDesc.internal, yDesc.internal, y.Pointer(), C.int(requestedAlgoCount), &returnedAlgoCountC, perfResults.internal, workSpace.Pointer(), C.size_t(workSpaceSizeInBytes))) -// returnedAlgoCount = int(returnedAlgoCountC) -// return -// } + // call cudnnBatchNormalizationForwardTraining + return result(C.cudnnBatchNormalizationForwardTraining(co.internal, mode.C(), alphaC, betaC, xDesc.internal, unsafe.Pointer(x.Uintptr()), yDesc.internal, unsafe.Pointer(y.Uintptr()), bnScaleBiasMeanVarDesc.internal, unsafe.Pointer(bnScale.Uintptr()), unsafe.Pointer(bnBias.Uintptr()), C.double(exponentialAverageFactor), unsafe.Pointer(resultRunningMean.Uintptr()), unsafe.Pointer(resultRunningVariance.Uintptr()), C.double(epsilon), unsafe.Pointer(resultSaveMean.Uintptr()), unsafe.Pointer(resultSaveInvVariance.Uintptr()))) +} -// ConvolutionForward executes convolutions or cross-correlations over x using filters specified with w, returning results in y. Scaling factors alpha and beta can be used to scale the input tensor and the output tensor respectively. -// y is both an input and output -func (co *Context) ConvolutionForward(alpha float64, xDesc *TensorDescriptor, x Memory, wDesc *Filter, w Memory, convDesc *Convolution, algo ConvolutionFwdAlgo, workSpace Memory, workSpaceSizeInBytes uintptr, beta float64, yDesc *TensorDescriptor, y Memory) error { +// ConvolutionBackwardBias computes the convolution function gradient with respect to the bias, which is the sum of every element belonging to the same feature map across all of the images of the input tensor. Therefore, the number of elements produced is equal to the number of features maps of the input tensor. +func (co *Context) ConvolutionBackwardBias(alpha float64, dyDesc *TensorDescriptor, dy Memory, beta float64, dbDesc *TensorDescriptor, db Memory) error { + // DOUBLECHECK: "cudnnConvolutionBackwardBias" returns Memory type in Parameter 6 var alphaC, betaC unsafe.Pointer - switch xDesc.dataType { + switch dyDesc.dataType { case Float, Half: var alphaF, betaF C.float alphaF = C.float(float32(alpha)) @@ -203,39 +178,15 @@ func (co *Context) ConvolutionForward(alpha float64, xDesc *TensorDescriptor, x alphaC = unsafe.Pointer(&alphaF) betaC = unsafe.Pointer(&betaF) default: - return errors.Errorf("Unsupported data type: %v", xDesc.dataType) - } - // call cudnnConvolutionForward - return result(C.cudnnConvolutionForward(co.internal, alphaC, xDesc.internal, x.Pointer(), wDesc.internal, w.Pointer(), convDesc.internal, algo.C(), workSpace.Pointer(), C.size_t(workSpaceSizeInBytes), betaC, yDesc.internal, y.Pointer())) -} - -// ConvolutionBiasActivationForward applies a bias and then an activation to the convolutions or cross-correlations of cudnnConvolutionForward(), returning results in y. The full computation follows the equation y = act ( alpha1 * conv(x) + alpha2 * z + bias ). 
-// y is both an input and output -func (co *Context) ConvolutionBiasActivationForward(alpha1 float64, xDesc *TensorDescriptor, x Memory, wDesc *Filter, w Memory, convDesc *Convolution, algo ConvolutionFwdAlgo, workSpace Memory, workSpaceSizeInBytes uintptr, alpha2 float64, zDesc *TensorDescriptor, z Memory, biasDesc *TensorDescriptor, bias Memory, activationDesc *Activation, yDesc *TensorDescriptor, y Memory) error { - var alpha1C, alpha2C unsafe.Pointer - switch xDesc.dataType { - case Float, Half: - var alpha1F, alpha2F C.float - alpha1F = C.float(float32(alpha1)) - alpha2F = C.float(float32(alpha2)) - alpha1C = unsafe.Pointer(&alpha1F) - alpha2C = unsafe.Pointer(&alpha2F) - case Double: - var alpha1F, alpha2F C.double - alpha1F = C.double(alpha1) - alpha2F = C.double(alpha2) - alpha1C = unsafe.Pointer(&alpha1F) - alpha2C = unsafe.Pointer(&alpha2F) - default: - return errors.Errorf("Unsupported data type: %v", xDesc.dataType) + return errors.Errorf("Unsupported data type: %v", dyDesc.dataType) } - // call cudnnConvolutionBiasActivationForward - return result(C.cudnnConvolutionBiasActivationForward(co.internal, alpha1C, xDesc.internal, x.Pointer(), wDesc.internal, w.Pointer(), convDesc.internal, algo.C(), workSpace.Pointer(), C.size_t(workSpaceSizeInBytes), alpha2C, zDesc.internal, z.Pointer(), biasDesc.internal, bias.Pointer(), activationDesc.internal, yDesc.internal, y.Pointer())) + // call cudnnConvolutionBackwardBias + return result(C.cudnnConvolutionBackwardBias(co.internal, alphaC, dyDesc.internal, unsafe.Pointer(dy.Uintptr()), betaC, dbDesc.internal, unsafe.Pointer(db.Uintptr()))) } -// ConvolutionBackwardBias computes the convolution function gradient with respect to the bias, which is the sum of every element belonging to the same feature map across all of the images of the input tensor. Therefore, the number of elements produced is equal to the number of features maps of the input tensor. -func (co *Context) ConvolutionBackwardBias(alpha float64, dyDesc *TensorDescriptor, dy Memory, beta float64, dbDesc *TensorDescriptor, db Memory) error { - // DOUBLECHECK: "cudnnConvolutionBackwardBias" returns Memory type in Parameter 6 +// ConvolutionBackwardData computes the convolution data gradient of the tensor dy, where y is the output of the forward convolution in cudnnConvolutionForward(). It uses the specified algo, and returns the results in the output tensor dx. Scaling factors alpha and beta can be used to scale the computed result or accumulate with the current dx. 
+// dx is both an input and output +func (co *Context) ConvolutionBackwardData(alpha float64, wDesc *Filter, w Memory, dyDesc *TensorDescriptor, dy Memory, convDesc *Convolution, algo ConvolutionBwdDataAlgo, workSpace Memory, workSpaceSizeInBytes uintptr, beta float64, dxDesc *TensorDescriptor, dx Memory) error { var alphaC, betaC unsafe.Pointer switch dyDesc.dataType { case Float, Half: @@ -253,32 +204,11 @@ func (co *Context) ConvolutionBackwardBias(alpha float64, dyDesc *TensorDescript default: return errors.Errorf("Unsupported data type: %v", dyDesc.dataType) } - // call cudnnConvolutionBackwardBias - return result(C.cudnnConvolutionBackwardBias(co.internal, alphaC, dyDesc.internal, dy.Pointer(), betaC, dbDesc.internal, db.Pointer())) -} - -// // FindConvolutionBackwardFilterAlgorithm attempts all cuDNN algorithms (including CUDNN_TENSOR_OP_MATH and CUDNN_DEFAULT_MATH versions of algorithms where CUDNN_TENSOR_OP_MATH may be available) for cudnnConvolutionBackwardFilter(), using GPU memory allocated via cudaMalloc(), and outputs performance metrics to a user-allocated array of cudnnConvolutionBwdFilterAlgoPerf_t. These metrics are written in sorted fashion where the first element has the lowest compute time. The total number of resulting algorithms can be queried through the API cudnnGetConvolutionBackwardMaxCount(). -// func (co *Context) FindConvolutionBackwardFilterAlgorithm(xDesc *TensorDescriptor, dyDesc *TensorDescriptor, convDesc *Convolution, dwDesc *Filter, requestedAlgoCount int) (returnedAlgoCount int, perfResults *ConvolutionBwdPerf, err error) { -// var returnedAlgoCountC C.int -// // TODO: perfResults cudnnConvolutionBwdFilterAlgoPerf_t -// // call cudnnFindConvolutionBackwardFilterAlgorithm -// err = result(C.cudnnFindConvolutionBackwardFilterAlgorithm(co.internal, xDesc.internal, dyDesc.internal, convDesc.internal, dwDesc.internal, C.int(requestedAlgoCount), &returnedAlgoCountC, perfResults.internal)) -// returnedAlgoCount = int(returnedAlgoCountC) -// return -// } - -// // FindConvolutionBackwardFilterAlgorithmEx attempts all cuDNN algorithms (including CUDNN_TENSOR_OP_MATH and CUDNN_DEFAULT_MATH versions of algorithms where CUDNN_TENSOR_OP_MATH may be available) for cudnnConvolutionBackwardFilter, using user-allocated GPU memory, and outputs performance metrics to a user-allocated array of cudnnConvolutionBwdFilterAlgoPerf_t. These metrics are written in sorted fashion where the first element has the lowest compute time. The total number of resulting algorithms can be queried through the API cudnnGetConvolutionBackwardMaxCount(). 
-// // dw is both an input and output -// func (co *Context) FindConvolutionBackwardFilterAlgorithmEx(xDesc *TensorDescriptor, x Memory, dyDesc *TensorDescriptor, y Memory, convDesc *Convolution, dwDesc *Filter, dw Memory, requestedAlgoCount int, workSpace Memory, workSpaceSizeInBytes uintptr) (returnedAlgoCount int, perfResults *ConvolutionBwdPerf, err error) { -// var returnedAlgoCountC C.int -// // TODO: perfResults cudnnConvolutionBwdFilterAlgoPerf_t -// // call cudnnFindConvolutionBackwardFilterAlgorithmEx -// err = result(C.cudnnFindConvolutionBackwardFilterAlgorithmEx(co.internal, xDesc.internal, x.Pointer(), dyDesc.internal, y.Pointer(), convDesc.internal, dwDesc.internal, dw.Pointer(), C.int(requestedAlgoCount), &returnedAlgoCountC, perfResults.internal, workSpace.Pointer(), C.size_t(workSpaceSizeInBytes))) -// returnedAlgoCount = int(returnedAlgoCountC) -// return -// } - -// ConvolutionBackwardFilter computes the convolution gradient with respect to filter coefficients using the specified algo, returning results in gradDesc.Scaling factors alpha and beta can be used to scale the input tensor and the output tensor respectively. + // call cudnnConvolutionBackwardData + return result(C.cudnnConvolutionBackwardData(co.internal, alphaC, wDesc.internal, unsafe.Pointer(w.Uintptr()), dyDesc.internal, unsafe.Pointer(dy.Uintptr()), convDesc.internal, algo.C(), unsafe.Pointer(workSpace.Uintptr()), C.size_t(workSpaceSizeInBytes), betaC, dxDesc.internal, unsafe.Pointer(dx.Uintptr()))) +} + +// ConvolutionBackwardFilter computes the convolution weight (filter) gradient of the tensor dy, where y is the output of the forward convolution in cudnnConvolutionForward(). It uses the specified algo, and returns the results in the output tensor dw. Scaling factors alpha and beta can be used to scale the computed result or accumulate with the current dw. // dw is both an input and output func (co *Context) ConvolutionBackwardFilter(alpha float64, xDesc *TensorDescriptor, x Memory, dyDesc *TensorDescriptor, dy Memory, convDesc *Convolution, algo ConvolutionBwdFilterAlgo, workSpace Memory, workSpaceSizeInBytes uintptr, beta float64, dwDesc *Filter, dw Memory) error { var alphaC, betaC unsafe.Pointer @@ -299,35 +229,38 @@ func (co *Context) ConvolutionBackwardFilter(alpha float64, xDesc *TensorDescrip return errors.Errorf("Unsupported data type: %v", xDesc.dataType) } // call cudnnConvolutionBackwardFilter - return result(C.cudnnConvolutionBackwardFilter(co.internal, alphaC, xDesc.internal, x.Pointer(), dyDesc.internal, dy.Pointer(), convDesc.internal, algo.C(), workSpace.Pointer(), C.size_t(workSpaceSizeInBytes), betaC, dwDesc.internal, dw.Pointer())) -} - -// // FindConvolutionBackwardDataAlgorithm attempts all cuDNN algorithms (including CUDNN_TENSOR_OP_MATH and CUDNN_DEFAULT_MATH versions of algorithms where CUDNN_TENSOR_OP_MATH may be available) for cudnnConvolutionBackwardData(), using memory allocated via cudaMalloc() and outputs performance metrics to a user-allocated array of cudnnConvolutionBwdDataAlgoPerf_t. These metrics are written in sorted fashion where the first element has the lowest compute time. The total number of resulting algorithms can be queried through the API cudnnGetConvolutionBackwardMaxCount(). 
-// func (co *Context) FindConvolutionBackwardDataAlgorithm(wDesc *Filter, dyDesc *TensorDescriptor, convDesc *Convolution, dxDesc *TensorDescriptor, requestedAlgoCount int) (returnedAlgoCount int, perfResults *ConvolutionBwdDataPerf, err error) { -// var returnedAlgoCountC C.int -// // TODO: perfResults cudnnConvolutionBwdDataAlgoPerf_t -// // call cudnnFindConvolutionBackwardDataAlgorithm -// err = result(C.cudnnFindConvolutionBackwardDataAlgorithm(co.internal, wDesc.internal, dyDesc.internal, convDesc.internal, dxDesc.internal, C.int(requestedAlgoCount), &returnedAlgoCountC, perfResults.internal)) -// returnedAlgoCount = int(returnedAlgoCountC) -// return -// } - -// // FindConvolutionBackwardDataAlgorithmEx attempts all cuDNN algorithms (including CUDNN_TENSOR_OP_MATH and CUDNN_DEFAULT_MATH versions of algorithms where CUDNN_TENSOR_OP_MATH may be available) for cudnnConvolutionBackwardData, using user-allocated GPU memory, and outputs performance metrics to a user-allocated array of cudnnConvolutionBwdDataAlgoPerf_t. These metrics are written in sorted fashion where the first element has the lowest compute time. The total number of resulting algorithms can be queried through the API cudnnGetConvolutionBackwardMaxCount(). -// // dxDesc is both an input and output -// func (co *Context) FindConvolutionBackwardDataAlgorithmEx(wDesc *Filter, w Memory, dyDesc *TensorDescriptor, dy Memory, convDesc *Convolution, dxDesc *TensorDescriptor, dx Memory, requestedAlgoCount int, workSpace Memory, workSpaceSizeInBytes uintptr) (returnedAlgoCount int, perfResults *ConvolutionBwdDataPerf, err error) { -// var returnedAlgoCountC C.int -// // TODO: perfResults cudnnConvolutionBwdDataAlgoPerf_t -// // call cudnnFindConvolutionBackwardDataAlgorithmEx -// err = result(C.cudnnFindConvolutionBackwardDataAlgorithmEx(co.internal, wDesc.internal, w.Pointer(), dyDesc.internal, dy.Pointer(), convDesc.internal, dxDesc.internal, dx.Pointer(), C.int(requestedAlgoCount), &returnedAlgoCountC, perfResults.internal, workSpace.Pointer(), C.size_t(workSpaceSizeInBytes))) -// returnedAlgoCount = int(returnedAlgoCountC) -// return -// } - -// ConvolutionBackwardData computes the convolution gradient with respect to the output tensor using the specified algo, returning results in gradDesc. Scaling factors alpha and beta can be used to scale the input tensor and the output tensor respectively. -// dx is both an input and output -func (co *Context) ConvolutionBackwardData(alpha float64, wDesc *Filter, w Memory, dyDesc *TensorDescriptor, dy Memory, convDesc *Convolution, algo ConvolutionBwdDataAlgo, workSpace Memory, workSpaceSizeInBytes uintptr, beta float64, dxDesc *TensorDescriptor, dx Memory) error { + return result(C.cudnnConvolutionBackwardFilter(co.internal, alphaC, xDesc.internal, unsafe.Pointer(x.Uintptr()), dyDesc.internal, unsafe.Pointer(dy.Uintptr()), convDesc.internal, algo.C(), unsafe.Pointer(workSpace.Uintptr()), C.size_t(workSpaceSizeInBytes), betaC, dwDesc.internal, unsafe.Pointer(dw.Uintptr()))) +} + +// Input. Handle to a previously created cuDNN context. For more information, see cudnnHandle_t. 
+// y is both an input and output +func (co *Context) ConvolutionBiasActivationForward(alpha1 float64, xDesc *TensorDescriptor, x Memory, wDesc *Filter, w Memory, convDesc *Convolution, algo ConvolutionFwdAlgo, workSpace Memory, workSpaceSizeInBytes uintptr, alpha2 float64, zDesc *TensorDescriptor, z Memory, biasDesc *TensorDescriptor, bias Memory, activationDesc *Activation, yDesc *TensorDescriptor, y Memory) error { + var alpha1C, alpha2C unsafe.Pointer + switch xDesc.dataType { + case Float, Half: + var alpha1F, alpha2F C.float + alpha1F = C.float(float32(alpha1)) + alpha2F = C.float(float32(alpha2)) + alpha1C = unsafe.Pointer(&alpha1F) + alpha2C = unsafe.Pointer(&alpha2F) + case Double: + var alpha1F, alpha2F C.double + alpha1F = C.double(alpha1) + alpha2F = C.double(alpha2) + alpha1C = unsafe.Pointer(&alpha1F) + alpha2C = unsafe.Pointer(&alpha2F) + default: + return errors.Errorf("Unsupported data type: %v", xDesc.dataType) + } + // call cudnnConvolutionBiasActivationForward + return result(C.cudnnConvolutionBiasActivationForward(co.internal, alpha1C, xDesc.internal, unsafe.Pointer(x.Uintptr()), wDesc.internal, unsafe.Pointer(w.Uintptr()), convDesc.internal, algo.C(), unsafe.Pointer(workSpace.Uintptr()), C.size_t(workSpaceSizeInBytes), alpha2C, zDesc.internal, unsafe.Pointer(z.Uintptr()), biasDesc.internal, unsafe.Pointer(bias.Uintptr()), activationDesc.internal, yDesc.internal, unsafe.Pointer(y.Uintptr()))) +} + +// ConvolutionForward executes convolutions or cross-correlations over x using filters specified with w, returning results in y. Scaling factors alpha and beta can be used to scale the input tensor and the output tensor respectively. +// y is both an input and output +func (co *Context) ConvolutionForward(alpha float64, xDesc *TensorDescriptor, x Memory, wDesc *Filter, w Memory, convDesc *Convolution, algo ConvolutionFwdAlgo, workSpace Memory, workSpaceSizeInBytes uintptr, beta float64, yDesc *TensorDescriptor, y Memory) error { var alphaC, betaC unsafe.Pointer - switch dyDesc.dataType { + switch xDesc.dataType { case Float, Half: var alphaF, betaF C.float alphaF = C.float(float32(alpha)) @@ -341,22 +274,15 @@ func (co *Context) ConvolutionBackwardData(alpha float64, wDesc *Filter, w Memor alphaC = unsafe.Pointer(&alphaF) betaC = unsafe.Pointer(&betaF) default: - return errors.Errorf("Unsupported data type: %v", dyDesc.dataType) + return errors.Errorf("Unsupported data type: %v", xDesc.dataType) } - // call cudnnConvolutionBackwardData - return result(C.cudnnConvolutionBackwardData(co.internal, alphaC, wDesc.internal, w.Pointer(), dyDesc.internal, dy.Pointer(), convDesc.internal, algo.C(), workSpace.Pointer(), C.size_t(workSpaceSizeInBytes), betaC, dxDesc.internal, dx.Pointer())) -} - -// Im2Col constructs the A matrix necessary to perform a forward pass of GEMM convolution. Im2Col A matrix has a height of batch_size*y_height*y_width and width of input_channels*filter_height*filter_width, where batch_size is xDesc's first dimension, y_height/y_width are computed from cudnnGetConvolutionNdForwardOutputDim(), input_channels is xDesc's second dimension, filter_height/filter_width are wDesc's third and fourth dimension. The A matrix is stored in format HW-fully-packed in GPU memory. 
-func (co *Context) Im2Col(xDesc *TensorDescriptor, x Memory, wDesc *Filter, convDesc *Convolution, colBuffer Memory) error { - // DOUBLECHECK: "cudnnIm2Col" returns Memory type in Parameter 5 - // call cudnnIm2Col - return result(C.cudnnIm2Col(co.internal, xDesc.internal, x.Pointer(), wDesc.internal, convDesc.internal, colBuffer.Pointer())) + // call cudnnConvolutionForward + return result(C.cudnnConvolutionForward(co.internal, alphaC, xDesc.internal, unsafe.Pointer(x.Uintptr()), wDesc.internal, unsafe.Pointer(w.Uintptr()), convDesc.internal, algo.C(), unsafe.Pointer(workSpace.Uintptr()), C.size_t(workSpaceSizeInBytes), betaC, yDesc.internal, unsafe.Pointer(y.Uintptr()))) } -// SoftmaxForward computes the softmax function. -func (co *Context) SoftmaxForward(algo SoftmaxAlgorithm, mode SoftmaxMode, alpha float64, xDesc *TensorDescriptor, x Memory, beta float64, yDesc *TensorDescriptor, y Memory) error { - // DOUBLECHECK: "cudnnSoftmaxForward" returns Memory type in Parameter 8 +// DivisiveNormalizationBackward performs the backward DivisiveNormalization layer computation. +func (co *Context) DivisiveNormalizationBackward(normDesc *LRN, mode DivNormMode, alpha float64, xDesc *TensorDescriptor, x Memory, means Memory, dy Memory, temp Memory, temp2 Memory, beta float64, dXdMeansDesc *TensorDescriptor, dx Memory, dMeans Memory) error { + // DOUBLECHECK: "cudnnDivisiveNormalizationBackward" returns Memory type in Parameter 13 var alphaC, betaC unsafe.Pointer switch xDesc.dataType { case Float, Half: @@ -374,15 +300,15 @@ func (co *Context) SoftmaxForward(algo SoftmaxAlgorithm, mode SoftmaxMode, alpha default: return errors.Errorf("Unsupported data type: %v", xDesc.dataType) } - // call cudnnSoftmaxForward - return result(C.cudnnSoftmaxForward(co.internal, algo.C(), mode.C(), alphaC, xDesc.internal, x.Pointer(), betaC, yDesc.internal, y.Pointer())) + // call cudnnDivisiveNormalizationBackward + return result(C.cudnnDivisiveNormalizationBackward(co.internal, normDesc.internal, mode.C(), alphaC, xDesc.internal, unsafe.Pointer(x.Uintptr()), unsafe.Pointer(means.Uintptr()), unsafe.Pointer(dy.Uintptr()), unsafe.Pointer(temp.Uintptr()), unsafe.Pointer(temp2.Uintptr()), betaC, dXdMeansDesc.internal, unsafe.Pointer(dx.Uintptr()), unsafe.Pointer(dMeans.Uintptr()))) } -// SoftmaxBackward computes the gradient of the softmax function. -func (co *Context) SoftmaxBackward(algo SoftmaxAlgorithm, mode SoftmaxMode, alpha float64, yDesc *TensorDescriptor, y Memory, dyDesc *TensorDescriptor, dy Memory, beta float64, dxDesc *TensorDescriptor, dx Memory) error { - // DOUBLECHECK: "cudnnSoftmaxBackward" returns Memory type in Parameter 10 +// The x-mean(x) which is often referred to as `subtractive normalization` portion of the computation can be implemented using cuDNN average pooling layer followed by a call to addTensor. 
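A rough sketch of that recipe in terms of the wrappers in this package. Here `poolDesc` is assumed to be an average-pooling descriptor whose output shape matches `meansDesc`, and all descriptors and `Memory` values are assumed to be set up elsewhere:

```
// computeSubtractiveNorm sketches the x-mean(x) recipe described above:
// average-pool x into means, then subtract via AddTensor.
func computeSubtractiveNorm(ctx *Context, poolDesc *Pooling, xDesc, meansDesc *TensorDescriptor, x, means Memory) error {
	// means = avgpool(x)
	if err := ctx.PoolingForward(poolDesc, 1.0, xDesc, x, 0.0, meansDesc, means); err != nil {
		return err
	}
	// x = -1*means + 1*x  (AddTensor computes C = alpha*A + beta*C)
	return ctx.AddTensor(-1.0, meansDesc, means, 1.0, xDesc, x)
}
```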
+func (co *Context) DivisiveNormalizationForward(normDesc *LRN, mode DivNormMode, alpha float64, xDesc *TensorDescriptor, x Memory, means Memory, temp Memory, temp2 Memory, beta float64, yDesc *TensorDescriptor, y Memory) error { + // DOUBLECHECK: "cudnnDivisiveNormalizationForward" returns Memory type in Parameter 11 var alphaC, betaC unsafe.Pointer - switch yDesc.dataType { + switch xDesc.dataType { case Float, Half: var alphaF, betaF C.float alphaF = C.float(float32(alpha)) @@ -396,17 +322,128 @@ func (co *Context) SoftmaxBackward(algo SoftmaxAlgorithm, mode SoftmaxMode, alph alphaC = unsafe.Pointer(&alphaF) betaC = unsafe.Pointer(&betaF) default: - return errors.Errorf("Unsupported data type: %v", yDesc.dataType) + return errors.Errorf("Unsupported data type: %v", xDesc.dataType) } - // call cudnnSoftmaxBackward - return result(C.cudnnSoftmaxBackward(co.internal, algo.C(), mode.C(), alphaC, yDesc.internal, y.Pointer(), dyDesc.internal, dy.Pointer(), betaC, dxDesc.internal, dx.Pointer())) + // call cudnnDivisiveNormalizationForward + return result(C.cudnnDivisiveNormalizationForward(co.internal, normDesc.internal, mode.C(), alphaC, xDesc.internal, unsafe.Pointer(x.Uintptr()), unsafe.Pointer(means.Uintptr()), unsafe.Pointer(temp.Uintptr()), unsafe.Pointer(temp2.Uintptr()), betaC, yDesc.internal, unsafe.Pointer(y.Uintptr()))) } -// PoolingForward computes pooling of input values (i.e., the maximum or average of several adjacent values) to produce an output with smaller height and/or width. -func (co *Context) PoolingForward(poolingDesc *Pooling, alpha float64, xDesc *TensorDescriptor, x Memory, beta float64, yDesc *TensorDescriptor, y Memory) error { - // DOUBLECHECK: "cudnnPoolingForward" returns Memory type in Parameter 7 +// DropoutBackward performs backward dropout operation over dy returning results in dx. If during forward dropout operation value from x was propagated to y then during backward operation value from dy will be propagated to dx, otherwise, dx value will be set to 0. +func (co *Context) DropoutBackward(dropoutDesc *Dropout, dydesc *TensorDescriptor, dy Memory, dxdesc *TensorDescriptor, dx Memory, reserveSpace Memory, reserveSpaceSizeInBytes uintptr) error { + // DOUBLECHECK: "cudnnDropoutBackward" returns Memory type in Parameter 5 + // call cudnnDropoutBackward + return result(C.cudnnDropoutBackward(co.internal, dropoutDesc.internal, dydesc.internal, unsafe.Pointer(dy.Uintptr()), dxdesc.internal, unsafe.Pointer(dx.Uintptr()), unsafe.Pointer(reserveSpace.Uintptr()), C.size_t(reserveSpaceSizeInBytes))) +} + +// DropoutForward performs forward dropout operation over x returning results in y. If dropout was used as a parameter to cudnnSetDropoutDescriptor(), the approximately dropout fraction of x values will be replaced by a 0, and the rest will be scaled by 1/(1-dropout). DropoutForward should not be running concurrently with another DropoutForward() function using the same states. 
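The forward/backward pair must share the same reserve space. A sketch (sizing of `reserve`, via cudnnDropoutGetReserveSpaceSize, and all descriptor setup are elided and assumed):

```
// dropoutRoundTrip sketches a matched forward/backward dropout pair.
// The identical reserveSpace must be handed to both calls so that the
// backward pass sees exactly the mask the forward pass recorded.
func dropoutRoundTrip(ctx *Context, dr *Dropout, xDesc, yDesc *TensorDescriptor, x, y, dx, dy, reserve Memory, reserveSize uintptr) error {
	if err := ctx.DropoutForward(dr, xDesc, x, yDesc, y, reserve, reserveSize); err != nil {
		return err
	}
	return ctx.DropoutBackward(dr, yDesc, dy, xDesc, dx, reserve, reserveSize)
}
```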
+func (co *Context) DropoutForward(dropoutDesc *Dropout, xdesc *TensorDescriptor, x Memory, ydesc *TensorDescriptor, y Memory, reserveSpace Memory, reserveSpaceSizeInBytes uintptr) error {
+	// DOUBLECHECK: "cudnnDropoutForward" returns Memory type in Parameter 6
+	// call cudnnDropoutForward
+	return result(C.cudnnDropoutForward(co.internal, dropoutDesc.internal, xdesc.internal, unsafe.Pointer(x.Uintptr()), ydesc.internal, unsafe.Pointer(y.Uintptr()), unsafe.Pointer(reserveSpace.Uintptr()), C.size_t(reserveSpaceSizeInBytes)))
+}
+
+// DropoutGetStatesSize is used to query the amount of space required to store the states of the random number generators used by the cudnnDropoutForward() function.
+func (co *Context) DropoutGetStatesSize() (sizeInBytes uintptr, err error) {
+	var sizeInBytesC C.size_t
+	// call cudnnDropoutGetStatesSize
+	err = result(C.cudnnDropoutGetStatesSize(co.internal, &sizeInBytesC))
+	sizeInBytes = uintptr(sizeInBytesC)
+	return
+}
+
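Example (illustrative, not part of the patch): the dropout calls above are tied together by two buffers — an RNG states buffer sized by DropoutGetStatesSize(), and a per-tensor reserve space sized by the TensorDescriptor method DropoutGetReserveSpaceSize() (added later in this patch). The `alloc` device allocator below is hypothetical:

	// dropoutBuffers shows how the two size queries drive allocation.
	func dropoutBuffers(ctx *Context, xDesc *TensorDescriptor, alloc func(uintptr) Memory) (states, reserve Memory, reserveSize uintptr, err error) {
		stateSize, err := ctx.DropoutGetStatesSize()
		if err != nil {
			return nil, nil, 0, err
		}
		if reserveSize, err = xDesc.DropoutGetReserveSpaceSize(); err != nil {
			return nil, nil, 0, err
		}
		return alloc(stateSize), alloc(reserveSize), reserveSize, nil
	}

	// dropoutRoundTrip pairs a forward call with its backward counterpart;
	// the same reserve space must be passed to both.
	func dropoutRoundTrip(ctx *Context, do *Dropout, xDesc, yDesc, dyDesc, dxDesc *TensorDescriptor, x, y, dy, dx, reserve Memory, reserveSize uintptr) error {
		if err := ctx.DropoutForward(do, xDesc, x, yDesc, y, reserve, reserveSize); err != nil {
			return err
		}
		return ctx.DropoutBackward(do, dyDesc, dy, dxDesc, dx, reserve, reserveSize)
	}
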
+// FindConvolutionBackwardDataAlgorithm attempts all algorithms available for cudnnConvolutionBackwardData(). It will attempt both the provided convDesc mathType and CUDNN_DEFAULT_MATH (assuming the two differ).
+func (co *Context) FindConvolutionBackwardDataAlgorithm(wDesc *Filter, dyDesc *TensorDescriptor, convDesc *Convolution, dxDesc *TensorDescriptor, requestedAlgoCount int) (returnedAlgoCount int, perfResults *ConvolutionBwdDataPerf, err error) {
+	var returnedAlgoCountC C.int
+	perfResults = new(ConvolutionBwdDataPerf) // allocate the named return before taking &perfResults.internal
+	// TODO: perfResults cudnnConvolutionBwdDataAlgoPerf_t
+	// call cudnnFindConvolutionBackwardDataAlgorithm
+	err = result(C.cudnnFindConvolutionBackwardDataAlgorithm(co.internal, wDesc.internal, dyDesc.internal, convDesc.internal, dxDesc.internal, C.int(requestedAlgoCount), &returnedAlgoCountC, &perfResults.internal))
+	returnedAlgoCount = int(returnedAlgoCountC)
+	return
+}
+
+// FindConvolutionBackwardDataAlgorithmEx attempts all algorithms available for cudnnConvolutionBackwardData(). It will attempt both the provided convDesc mathType and CUDNN_DEFAULT_MATH (assuming the two differ).
+// dxDesc is both an input and output
+func (co *Context) FindConvolutionBackwardDataAlgorithmEx(wDesc *Filter, w Memory, dyDesc *TensorDescriptor, dy Memory, convDesc *Convolution, dxDesc *TensorDescriptor, dx Memory, requestedAlgoCount int, workSpace Memory, workSpaceSizeInBytes uintptr) (returnedAlgoCount int, perfResults *ConvolutionBwdDataPerf, err error) {
+	var returnedAlgoCountC C.int
+	perfResults = new(ConvolutionBwdDataPerf) // allocate the named return before taking &perfResults.internal
+	// TODO: perfResults cudnnConvolutionBwdDataAlgoPerf_t
+	// call cudnnFindConvolutionBackwardDataAlgorithmEx
+	err = result(C.cudnnFindConvolutionBackwardDataAlgorithmEx(co.internal, wDesc.internal, unsafe.Pointer(w.Uintptr()), dyDesc.internal, unsafe.Pointer(dy.Uintptr()), convDesc.internal, dxDesc.internal, unsafe.Pointer(dx.Uintptr()), C.int(requestedAlgoCount), &returnedAlgoCountC, &perfResults.internal, unsafe.Pointer(workSpace.Uintptr()), C.size_t(workSpaceSizeInBytes)))
+	returnedAlgoCount = int(returnedAlgoCountC)
+	return
+}
+
+// FindConvolutionBackwardFilterAlgorithm attempts all algorithms available for cudnnConvolutionBackwardFilter(). It will attempt both the provided convDesc mathType and CUDNN_DEFAULT_MATH (assuming the two differ).
+func (co *Context) FindConvolutionBackwardFilterAlgorithm(xDesc *TensorDescriptor, dyDesc *TensorDescriptor, convDesc *Convolution, dwDesc *Filter, requestedAlgoCount int) (returnedAlgoCount int, perfResults *ConvolutionBwdPerf, err error) {
+	var returnedAlgoCountC C.int
+	perfResults = new(ConvolutionBwdPerf) // allocate the named return before taking &perfResults.internal
+	// TODO: perfResults cudnnConvolutionBwdFilterAlgoPerf_t
+	// call cudnnFindConvolutionBackwardFilterAlgorithm
+	err = result(C.cudnnFindConvolutionBackwardFilterAlgorithm(co.internal, xDesc.internal, dyDesc.internal, convDesc.internal, dwDesc.internal, C.int(requestedAlgoCount), &returnedAlgoCountC, &perfResults.internal))
+	returnedAlgoCount = int(returnedAlgoCountC)
+	return
+}
+
+// FindConvolutionBackwardFilterAlgorithmEx attempts all algorithms available for cudnnConvolutionBackwardFilter(). It will attempt both the provided convDesc mathType and CUDNN_DEFAULT_MATH (assuming the two differ).
+// dw is both an input and output
+func (co *Context) FindConvolutionBackwardFilterAlgorithmEx(xDesc *TensorDescriptor, x Memory, dyDesc *TensorDescriptor, dy Memory, convDesc *Convolution, dwDesc *Filter, dw Memory, requestedAlgoCount int, workSpace Memory, workSpaceSizeInBytes uintptr) (returnedAlgoCount int, perfResults *ConvolutionBwdPerf, err error) {
+	var returnedAlgoCountC C.int
+	perfResults = new(ConvolutionBwdPerf) // allocate the named return before taking &perfResults.internal
+	// TODO: perfResults cudnnConvolutionBwdFilterAlgoPerf_t
+	// call cudnnFindConvolutionBackwardFilterAlgorithmEx
+	err = result(C.cudnnFindConvolutionBackwardFilterAlgorithmEx(co.internal, xDesc.internal, unsafe.Pointer(x.Uintptr()), dyDesc.internal, unsafe.Pointer(dy.Uintptr()), convDesc.internal, dwDesc.internal, unsafe.Pointer(dw.Uintptr()), C.int(requestedAlgoCount), &returnedAlgoCountC, &perfResults.internal, unsafe.Pointer(workSpace.Uintptr()), C.size_t(workSpaceSizeInBytes)))
+	returnedAlgoCount = int(returnedAlgoCountC)
+	return
+}
+
+// FindConvolutionForwardAlgorithm attempts all algorithms available for cudnnConvolutionForward(). It will attempt both the provided convDesc mathType and CUDNN_DEFAULT_MATH (assuming the two differ).
+func (co *Context) FindConvolutionForwardAlgorithm(xDesc *TensorDescriptor, wDesc *Filter, convDesc *Convolution, yDesc *TensorDescriptor, requestedAlgoCount int) (returnedAlgoCount int, perfResults *ConvolutionFwdPerf, err error) {
+	var returnedAlgoCountC C.int
+	perfResults = new(ConvolutionFwdPerf) // allocate the named return before taking &perfResults.internal
+	// TODO: perfResults cudnnConvolutionFwdAlgoPerf_t
+	// call cudnnFindConvolutionForwardAlgorithm
+	err = result(C.cudnnFindConvolutionForwardAlgorithm(co.internal, xDesc.internal, wDesc.internal, convDesc.internal, yDesc.internal, C.int(requestedAlgoCount), &returnedAlgoCountC, &perfResults.internal))
+	returnedAlgoCount = int(returnedAlgoCountC)
+	return
+}
+
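Example (illustrative, not part of the patch): a typical caller runs the finder once at setup time and caches the winner. This sketch assumes the package's existing github.com/pkg/errors import:

	// pickFwdAlgo runs the forward-algorithm search and returns its result.
	func pickFwdAlgo(ctx *Context, xDesc *TensorDescriptor, wDesc *Filter, convDesc *Convolution, yDesc *TensorDescriptor) (*ConvolutionFwdPerf, error) {
		count, perf, err := ctx.FindConvolutionForwardAlgorithm(xDesc, wDesc, convDesc, yDesc, 1)
		if err != nil {
			return nil, err
		}
		if count < 1 {
			return nil, errors.Errorf("no forward algorithm returned")
		}
		return perf, nil // perf records the chosen algorithm and its time/memory cost
	}
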
+// FindConvolutionForwardAlgorithmEx attempts all algorithms available for cudnnConvolutionForward(). It will attempt both the provided convDesc mathType and CUDNN_DEFAULT_MATH (assuming the two differ).
+// y is both an input and output
+func (co *Context) FindConvolutionForwardAlgorithmEx(xDesc *TensorDescriptor, x Memory, wDesc *Filter, w Memory, convDesc *Convolution, yDesc *TensorDescriptor, y Memory, requestedAlgoCount int, workSpace Memory, workSpaceSizeInBytes uintptr) (returnedAlgoCount int, perfResults *ConvolutionFwdPerf, err error) {
+	var returnedAlgoCountC C.int
+	perfResults = new(ConvolutionFwdPerf) // allocate the named return before taking &perfResults.internal
+	// TODO: perfResults cudnnConvolutionFwdAlgoPerf_t
+	// call cudnnFindConvolutionForwardAlgorithmEx
+	err = result(C.cudnnFindConvolutionForwardAlgorithmEx(co.internal, xDesc.internal, unsafe.Pointer(x.Uintptr()), wDesc.internal, unsafe.Pointer(w.Uintptr()), convDesc.internal, yDesc.internal, unsafe.Pointer(y.Uintptr()), C.int(requestedAlgoCount), &returnedAlgoCountC, &perfResults.internal, unsafe.Pointer(workSpace.Uintptr()), C.size_t(workSpaceSizeInBytes)))
+	returnedAlgoCount = int(returnedAlgoCountC)
+	return
+}
+
+// GetReductionIndicesSize is a helper function to return the minimum size of the index space to be passed to the reduction given the input and output tensors.
+func (co *Context) GetReductionIndicesSize(reduceTensorDesc *Reduction, aDesc *TensorDescriptor, cDesc *TensorDescriptor) (sizeInBytes uintptr, err error) {
+	var sizeInBytesC C.size_t
+	// call cudnnGetReductionIndicesSize
+	err = result(C.cudnnGetReductionIndicesSize(co.internal, reduceTensorDesc.internal, aDesc.internal, cDesc.internal, &sizeInBytesC))
+	sizeInBytes = uintptr(sizeInBytesC)
+	return
+}
+
+// GetReductionWorkspaceSize is a helper function to return the minimum size of the workspace to be passed to the reduction given the input and output tensors.
+func (co *Context) GetReductionWorkspaceSize(reduceTensorDesc *Reduction, aDesc *TensorDescriptor, cDesc *TensorDescriptor) (sizeInBytes uintptr, err error) {
+	var sizeInBytesC C.size_t
+	// call cudnnGetReductionWorkspaceSize
+	err = result(C.cudnnGetReductionWorkspaceSize(co.internal, reduceTensorDesc.internal, aDesc.internal, cDesc.internal, &sizeInBytesC))
+	sizeInBytes = uintptr(sizeInBytesC)
+	return
+}
+
+// Im2Col constructs the A matrix necessary to perform a forward pass of a GEMM convolution, writing the transformed x (described by xDesc) into colBuffer.
+func (co *Context) Im2Col(xDesc *TensorDescriptor, x Memory, wDesc *Filter, convDesc *Convolution, colBuffer Memory) error {
+	// DOUBLECHECK: "cudnnIm2Col" returns Memory type in Parameter 5
+	// call cudnnIm2Col
+	return result(C.cudnnIm2Col(co.internal, xDesc.internal, unsafe.Pointer(x.Uintptr()), wDesc.internal, convDesc.internal, unsafe.Pointer(colBuffer.Uintptr())))
+}
+
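Example (illustrative, not part of the patch): the two size queries above are meant to be used together to size the scratch buffers for ReduceTensor (defined later in this patch); `alloc` is a hypothetical device allocator:

	// reduceWithScratch sizes the scratch buffers and performs C_ = reduce(A).
	func reduceWithScratch(ctx *Context, rd *Reduction, aDesc, cDesc *TensorDescriptor, A, C_ Memory, alloc func(uintptr) Memory) error {
		idxSize, err := ctx.GetReductionIndicesSize(rd, aDesc, cDesc)
		if err != nil {
			return err
		}
		wsSize, err := ctx.GetReductionWorkspaceSize(rd, aDesc, cDesc)
		if err != nil {
			return err
		}
		indices, workspace := alloc(idxSize), alloc(wsSize)
		// alpha=1, beta=0: overwrite C_ with the reduction result.
		return ctx.ReduceTensor(rd, indices, idxSize, workspace, wsSize, 1.0, aDesc, A, 0.0, cDesc, C_)
	}
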
+// LRNCrossChannelBackward performs the backward LRN layer computation.
+func (co *Context) LRNCrossChannelBackward(normDesc *LRN, lrnMode LRNMode, alpha float64, yDesc *TensorDescriptor, y Memory, dyDesc *TensorDescriptor, dy Memory, xDesc *TensorDescriptor, x Memory, beta float64, dxDesc *TensorDescriptor, dx Memory) error {
+	// DOUBLECHECK: "cudnnLRNCrossChannelBackward" returns Memory type in Parameter 12
 	var alphaC, betaC unsafe.Pointer
-	switch xDesc.dataType {
+	switch yDesc.dataType {
 	case Float, Half:
 		var alphaF, betaF C.float
 		alphaF = C.float(float32(alpha))
@@ -420,17 +457,20 @@ func (co *Context) PoolingForward(poolingDesc *Pooling, alpha float64, xDesc *Te
 		alphaC = unsafe.Pointer(&alphaF)
 		betaC = unsafe.Pointer(&betaF)
 	default:
-		return errors.Errorf("Unsupported data type: %v", xDesc.dataType)
+		return errors.Errorf("Unsupported data type: %v", yDesc.dataType)
 	}
-	// call cudnnPoolingForward
-	return result(C.cudnnPoolingForward(co.internal, poolingDesc.internal, alphaC, xDesc.internal, x.Pointer(), betaC, yDesc.internal, y.Pointer()))
+	// call cudnnLRNCrossChannelBackward
+	return result(C.cudnnLRNCrossChannelBackward(co.internal, normDesc.internal, lrnMode.C(), alphaC, yDesc.internal, unsafe.Pointer(y.Uintptr()), dyDesc.internal, unsafe.Pointer(dy.Uintptr()), xDesc.internal, unsafe.Pointer(x.Uintptr()), betaC, dxDesc.internal, unsafe.Pointer(dx.Uintptr())))
 }
 
-// PoolingBackward computes the gradient of a pooling operation.
-func (co *Context) PoolingBackward(poolingDesc *Pooling, alpha float64, yDesc *TensorDescriptor, y Memory, dyDesc *TensorDescriptor, dy Memory, xDesc *TensorDescriptor, x Memory, beta float64, dxDesc *TensorDescriptor, dx Memory) error {
-	// DOUBLECHECK: "cudnnPoolingBackward" returns Memory type in Parameter 11
+// LRNCrossChannelForward performs the forward LRN layer computation.
+func (co *Context) LRNCrossChannelForward(normDesc *LRN, lrnMode LRNMode, alpha float64, xDesc *TensorDescriptor, x Memory, beta float64, yDesc *TensorDescriptor, y Memory) error {
+	// DOUBLECHECK: "cudnnLRNCrossChannelForward" returns Memory type in Parameter 8
 	var alphaC, betaC unsafe.Pointer
-	switch yDesc.dataType {
+	switch xDesc.dataType {
 	case Float, Half:
 		var alphaF, betaF C.float
 		alphaF = C.float(float32(alpha))
@@ -444,39 +484,43 @@ func (co *Context) PoolingBackward(poolingDesc *Pooling, alpha float64, yDesc *T
 		alphaC = unsafe.Pointer(&alphaF)
 		betaC = unsafe.Pointer(&betaF)
 	default:
-		return errors.Errorf("Unsupported data type: %v", yDesc.dataType)
+		return errors.Errorf("Unsupported data type: %v", xDesc.dataType)
 	}
-	// call cudnnPoolingBackward
-	return result(C.cudnnPoolingBackward(co.internal, poolingDesc.internal, alphaC, yDesc.internal, y.Pointer(), dyDesc.internal, dy.Pointer(), xDesc.internal, x.Pointer(), betaC, dxDesc.internal, dx.Pointer()))
+	// call cudnnLRNCrossChannelForward
+	return result(C.cudnnLRNCrossChannelForward(co.internal, normDesc.internal, lrnMode.C(), alphaC, xDesc.internal, unsafe.Pointer(x.Uintptr()), betaC, yDesc.internal, unsafe.Pointer(y.Uintptr())))
 }
 
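Example (illustrative, not part of the patch): a matched forward/backward LRN pair using the functions above; alpha=1, beta=0 overwrite the outputs rather than accumulate:

	// lrnRoundTrip runs the forward LRN pass and then its backward counterpart.
	// All descriptors and device buffers are assumed to be set up by the caller.
	func lrnRoundTrip(ctx *Context, norm *LRN, mode LRNMode, xDesc, yDesc, dyDesc, dxDesc *TensorDescriptor, x, y, dy, dx Memory) error {
		if err := ctx.LRNCrossChannelForward(norm, mode, 1.0, xDesc, x, 0.0, yDesc, y); err != nil {
			return err
		}
		// The backward pass consumes y, dy and x to produce dx.
		return ctx.LRNCrossChannelBackward(norm, mode, 1.0, yDesc, y, dyDesc, dy, xDesc, x, 0.0, dxDesc, dx)
	}
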
-// ActivationForward applies a specified neuron activation function element-wise over each input value.
-func (co *Context) ActivationForward(activationDesc *Activation, alpha float64, xDesc *TensorDescriptor, x Memory, beta float64, yDesc *TensorDescriptor, y Memory) error {
-	// DOUBLECHECK: "cudnnActivationForward" returns Memory type in Parameter 7
-	var alphaC, betaC unsafe.Pointer
-	switch xDesc.dataType {
+// OpTensor implements the equation C = op(alpha1[0] * A, alpha2[0] * B) + beta[0] * C, given the tensors A, B, and C and the scaling factors alpha1, alpha2, and beta. The op to use is indicated by the opTensorDesc descriptor. Currently-supported ops are listed by the OpTensorOp_t enum.
+// C_ is both an input and output
+func (co *Context) OpTensor(opTensorDesc *Op, alpha1 float64, aDesc *TensorDescriptor, A Memory, alpha2 float64, bDesc *TensorDescriptor, B Memory, beta float64, cDesc *TensorDescriptor, C_ Memory) error {
+	var alpha1C, alpha2C, betaC unsafe.Pointer
+	switch aDesc.dataType {
 	case Float, Half:
-		var alphaF, betaF C.float
-		alphaF = C.float(float32(alpha))
+		var alpha1F, alpha2F, betaF C.float
+		alpha1F = C.float(float32(alpha1))
+		alpha2F = C.float(float32(alpha2))
 		betaF = C.float(float32(beta))
-		alphaC = unsafe.Pointer(&alphaF)
+		alpha1C = unsafe.Pointer(&alpha1F)
+		alpha2C = unsafe.Pointer(&alpha2F)
 		betaC = unsafe.Pointer(&betaF)
 	case Double:
-		var alphaF, betaF C.double
-		alphaF = C.double(alpha)
+		var alpha1F, alpha2F, betaF C.double
+		alpha1F = C.double(alpha1)
+		alpha2F = C.double(alpha2)
 		betaF = C.double(beta)
-		alphaC = unsafe.Pointer(&alphaF)
+		alpha1C = unsafe.Pointer(&alpha1F)
+		alpha2C = unsafe.Pointer(&alpha2F)
 		betaC = unsafe.Pointer(&betaF)
 	default:
-		return errors.Errorf("Unsupported data type: %v", xDesc.dataType)
+		return errors.Errorf("Unsupported data type: %v", aDesc.dataType)
 	}
-	// call cudnnActivationForward
-	return result(C.cudnnActivationForward(co.internal, activationDesc.internal, alphaC, xDesc.internal, x.Pointer(), betaC, yDesc.internal, y.Pointer()))
+	// call cudnnOpTensor
+	return result(C.cudnnOpTensor(co.internal, opTensorDesc.internal, alpha1C, aDesc.internal, unsafe.Pointer(A.Uintptr()), alpha2C, bDesc.internal, unsafe.Pointer(B.Uintptr()), betaC, cDesc.internal, unsafe.Pointer(C_.Uintptr())))
 }
 
-// ActivationBackward computes the gradient of a neuron activation function.
-func (co *Context) ActivationBackward(activationDesc *Activation, alpha float64, yDesc *TensorDescriptor, y Memory, dyDesc *TensorDescriptor, dy Memory, xDesc *TensorDescriptor, x Memory, beta float64, dxDesc *TensorDescriptor, dx Memory) error {
-	// DOUBLECHECK: "cudnnActivationBackward" returns Memory type in Parameter 11
+// PoolingBackward computes the gradient of a pooling operation.
+func (co *Context) PoolingBackward(poolingDesc *Pooling, alpha float64, yDesc *TensorDescriptor, y Memory, dyDesc *TensorDescriptor, dy Memory, xDesc *TensorDescriptor, x Memory, beta float64, dxDesc *TensorDescriptor, dx Memory) error { + // DOUBLECHECK: "cudnnPoolingBackward" returns Memory type in Parameter 11 var alphaC, betaC unsafe.Pointer switch yDesc.dataType { case Float, Half: @@ -494,13 +538,13 @@ func (co *Context) ActivationBackward(activationDesc *Activation, alpha float64, default: return errors.Errorf("Unsupported data type: %v", yDesc.dataType) } - // call cudnnActivationBackward - return result(C.cudnnActivationBackward(co.internal, activationDesc.internal, alphaC, yDesc.internal, y.Pointer(), dyDesc.internal, dy.Pointer(), xDesc.internal, x.Pointer(), betaC, dxDesc.internal, dx.Pointer())) + // call cudnnPoolingBackward + return result(C.cudnnPoolingBackward(co.internal, poolingDesc.internal, alphaC, yDesc.internal, unsafe.Pointer(y.Uintptr()), dyDesc.internal, unsafe.Pointer(dy.Uintptr()), xDesc.internal, unsafe.Pointer(x.Uintptr()), betaC, dxDesc.internal, unsafe.Pointer(dx.Uintptr()))) } -// LRNCrossChannelForward performs the forward LRN layer computation. -func (co *Context) LRNCrossChannelForward(normDesc *LRN, lrnMode LRNMode, alpha float64, xDesc *TensorDescriptor, x Memory, beta float64, yDesc *TensorDescriptor, y Memory) error { - // DOUBLECHECK: "cudnnLRNCrossChannelForward" returns Memory type in Parameter 8 +// PoolingForward computes pooling of input values (meaning, the maximum or average of several adjacent values) to produce an output with smaller height and/or width. +func (co *Context) PoolingForward(poolingDesc *Pooling, alpha float64, xDesc *TensorDescriptor, x Memory, beta float64, yDesc *TensorDescriptor, y Memory) error { + // DOUBLECHECK: "cudnnPoolingForward" returns Memory type in Parameter 7 var alphaC, betaC unsafe.Pointer switch xDesc.dataType { case Float, Half: @@ -518,15 +562,16 @@ func (co *Context) LRNCrossChannelForward(normDesc *LRN, lrnMode LRNMode, alpha default: return errors.Errorf("Unsupported data type: %v", xDesc.dataType) } - // call cudnnLRNCrossChannelForward - return result(C.cudnnLRNCrossChannelForward(co.internal, normDesc.internal, lrnMode.C(), alphaC, xDesc.internal, x.Pointer(), betaC, yDesc.internal, y.Pointer())) + // call cudnnPoolingForward + return result(C.cudnnPoolingForward(co.internal, poolingDesc.internal, alphaC, xDesc.internal, unsafe.Pointer(x.Uintptr()), betaC, yDesc.internal, unsafe.Pointer(y.Uintptr()))) } -// LRNCrossChannelBackward performs the backward LRN layer computation. -func (co *Context) LRNCrossChannelBackward(normDesc *LRN, lrnMode LRNMode, alpha float64, yDesc *TensorDescriptor, y Memory, dyDesc *TensorDescriptor, dy Memory, xDesc *TensorDescriptor, x Memory, beta float64, dxDesc *TensorDescriptor, dx Memory) error { - // DOUBLECHECK: "cudnnLRNCrossChannelBackward" returns Memory type in Parameter 12 +// ReduceTensor reduces tensor A by implementing the equation C = alpha * reduce op ( A ) + beta * C, given tensors A and C and scaling factors alpha and beta. The reduction op to use is indicated by the descriptor reduceTensorDesc. Currently-supported ops are listed by the ReduceTensorOp_t enum. 
+// C_ is both an input and output +func (co *Context) ReduceTensor(reduceTensorDesc *Reduction, indices Memory, indicesSizeInBytes uintptr, workspace Memory, workspaceSizeInBytes uintptr, alpha float64, aDesc *TensorDescriptor, A Memory, beta float64, cDesc *TensorDescriptor, C_ Memory) error { + // DOUBLECHECK: "cudnnReduceTensor" returns Memory type in Parameter 2 var alphaC, betaC unsafe.Pointer - switch yDesc.dataType { + switch aDesc.dataType { case Float, Half: var alphaF, betaF C.float alphaF = C.float(float32(alpha)) @@ -540,42 +585,37 @@ func (co *Context) LRNCrossChannelBackward(normDesc *LRN, lrnMode LRNMode, alpha alphaC = unsafe.Pointer(&alphaF) betaC = unsafe.Pointer(&betaF) default: - return errors.Errorf("Unsupported data type: %v", yDesc.dataType) + return errors.Errorf("Unsupported data type: %v", aDesc.dataType) } - // TODO: dxDesc cudnnTensorDescriptor_t - // call cudnnLRNCrossChannelBackward - return result(C.cudnnLRNCrossChannelBackward(co.internal, normDesc.internal, lrnMode.C(), alphaC, yDesc.internal, y.Pointer(), dyDesc.internal, dy.Pointer(), xDesc.internal, x.Pointer(), betaC, dxDesc.internal, dx.Pointer())) + // call cudnnReduceTensor + return result(C.cudnnReduceTensor(co.internal, reduceTensorDesc.internal, unsafe.Pointer(indices.Uintptr()), C.size_t(indicesSizeInBytes), unsafe.Pointer(workspace.Uintptr()), C.size_t(workspaceSizeInBytes), alphaC, aDesc.internal, unsafe.Pointer(A.Uintptr()), betaC, cDesc.internal, unsafe.Pointer(C_.Uintptr()))) } -// DivisiveNormalizationForward performs the forward spatial DivisiveNormalization layer computation. It divides every value in a layer by the standard deviation of it's spatial neighbors as described in `What is the Best Multi-Stage Architecture for Object Recognition`, Jarrett 2009, Local Contrast Normalization Layer section. Note that Divisive Normalization only implements the x/max(c, sigma_x) portion of the computation, where sigma_x is the variance over the spatial neighborhood of x. The full LCN (Local Contrastive Normalization) computation can be implemented as a two-step process: -func (co *Context) DivisiveNormalizationForward(normDesc *LRN, mode DivNormMode, alpha float64, xDesc *TensorDescriptor, x Memory, means Memory, temp Memory, temp2 Memory, beta float64, yDesc *TensorDescriptor, y Memory) error { - // DOUBLECHECK: "cudnnDivisiveNormalizationForward" returns Memory type in Parameter 11 - var alphaC, betaC unsafe.Pointer - switch xDesc.dataType { +// ScaleTensor scales all the elements of a tensor by a given factor. 
+// y is both an input and output +func (co *Context) ScaleTensor(yDesc *TensorDescriptor, y Memory, alpha float64) error { + var alphaC unsafe.Pointer + switch yDesc.dataType { case Float, Half: - var alphaF, betaF C.float + var alphaF C.float alphaF = C.float(float32(alpha)) - betaF = C.float(float32(beta)) alphaC = unsafe.Pointer(&alphaF) - betaC = unsafe.Pointer(&betaF) case Double: - var alphaF, betaF C.double + var alphaF C.double alphaF = C.double(alpha) - betaF = C.double(beta) alphaC = unsafe.Pointer(&alphaF) - betaC = unsafe.Pointer(&betaF) default: - return errors.Errorf("Unsupported data type: %v", xDesc.dataType) + return errors.Errorf("Unsupported data type: %v", yDesc.dataType) } - // call cudnnDivisiveNormalizationForward - return result(C.cudnnDivisiveNormalizationForward(co.internal, normDesc.internal, mode.C(), alphaC, xDesc.internal, x.Pointer(), means.Pointer(), temp.Pointer(), temp2.Pointer(), betaC, yDesc.internal, y.Pointer())) + // call cudnnScaleTensor + return result(C.cudnnScaleTensor(co.internal, yDesc.internal, unsafe.Pointer(y.Uintptr()), alphaC)) } -// DivisiveNormalizationBackward performs the backward DivisiveNormalization layer computation. -func (co *Context) DivisiveNormalizationBackward(normDesc *LRN, mode DivNormMode, alpha float64, xDesc *TensorDescriptor, x Memory, means Memory, dy Memory, temp Memory, temp2 Memory, beta float64, dXdMeansDesc *TensorDescriptor, dx Memory, dMeans Memory) error { - // DOUBLECHECK: "cudnnDivisiveNormalizationBackward" returns Memory type in Parameter 13 +// SoftmaxBackward computes the gradient of the softmax function. +func (co *Context) SoftmaxBackward(algo SoftmaxAlgorithm, mode SoftmaxMode, alpha float64, yDesc *TensorDescriptor, y Memory, dyDesc *TensorDescriptor, dy Memory, beta float64, dxDesc *TensorDescriptor, dx Memory) error { + // DOUBLECHECK: "cudnnSoftmaxBackward" returns Memory type in Parameter 10 var alphaC, betaC unsafe.Pointer - switch xDesc.dataType { + switch yDesc.dataType { case Float, Half: var alphaF, betaF C.float alphaF = C.float(float32(alpha)) @@ -589,15 +629,15 @@ func (co *Context) DivisiveNormalizationBackward(normDesc *LRN, mode DivNormMode alphaC = unsafe.Pointer(&alphaF) betaC = unsafe.Pointer(&betaF) default: - return errors.Errorf("Unsupported data type: %v", xDesc.dataType) + return errors.Errorf("Unsupported data type: %v", yDesc.dataType) } - // call cudnnDivisiveNormalizationBackward - return result(C.cudnnDivisiveNormalizationBackward(co.internal, normDesc.internal, mode.C(), alphaC, xDesc.internal, x.Pointer(), means.Pointer(), dy.Pointer(), temp.Pointer(), temp2.Pointer(), betaC, dXdMeansDesc.internal, dx.Pointer(), dMeans.Pointer())) + // call cudnnSoftmaxBackward + return result(C.cudnnSoftmaxBackward(co.internal, algo.C(), mode.C(), alphaC, yDesc.internal, unsafe.Pointer(y.Uintptr()), dyDesc.internal, unsafe.Pointer(dy.Uintptr()), betaC, dxDesc.internal, unsafe.Pointer(dx.Uintptr()))) } -// BatchNormalizationForwardTraining performs the forward BatchNormalization layer computation for training phase. 
-func (co *Context) BatchNormalizationForwardTraining(mode BatchNormMode, alpha float64, beta float64, xDesc *TensorDescriptor, x Memory, yDesc *TensorDescriptor, y Memory, bnScaleBiasMeanVarDesc *TensorDescriptor, bnScale Memory, bnBias Memory, exponentialAverageFactor float64, resultRunningMean Memory, resultRunningVariance Memory, epsilon float64, resultSaveMean Memory, resultSaveInvVariance Memory) error { - // DOUBLECHECK: "cudnnBatchNormalizationForwardTraining" returns Memory type in Parameter 16 +// SoftmaxForward computes the softmax function. +func (co *Context) SoftmaxForward(algo SoftmaxAlgorithm, mode SoftmaxMode, alpha float64, xDesc *TensorDescriptor, x Memory, beta float64, yDesc *TensorDescriptor, y Memory) error { + // DOUBLECHECK: "cudnnSoftmaxForward" returns Memory type in Parameter 8 var alphaC, betaC unsafe.Pointer switch xDesc.dataType { case Float, Half: @@ -615,12 +655,27 @@ func (co *Context) BatchNormalizationForwardTraining(mode BatchNormMode, alpha f default: return errors.Errorf("Unsupported data type: %v", xDesc.dataType) } - // call cudnnBatchNormalizationForwardTraining - return result(C.cudnnBatchNormalizationForwardTraining(co.internal, mode.C(), alphaC, betaC, xDesc.internal, x.Pointer(), yDesc.internal, y.Pointer(), bnScaleBiasMeanVarDesc.internal, bnScale.Pointer(), bnBias.Pointer(), C.double(exponentialAverageFactor), resultRunningMean.Pointer(), resultRunningVariance.Pointer(), C.double(epsilon), resultSaveMean.Pointer(), resultSaveInvVariance.Pointer())) + // call cudnnSoftmaxForward + return result(C.cudnnSoftmaxForward(co.internal, algo.C(), mode.C(), alphaC, xDesc.internal, unsafe.Pointer(x.Uintptr()), betaC, yDesc.internal, unsafe.Pointer(y.Uintptr()))) } -// BatchNormalizationForwardInference performs the forward BatchNormalization layer computation for inference phase. BatchNormalizationForwardInference layer is based on the paper `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift`, S. Ioffe, C. Szegedy, 2015. -func (co *Context) BatchNormalizationForwardInference(mode BatchNormMode, alpha float64, beta float64, xDesc *TensorDescriptor, x Memory, yDesc *TensorDescriptor, y Memory, bnScaleBiasMeanVarDesc *TensorDescriptor, bnScale Memory, bnBias Memory, estimatedMean Memory, estimatedVariance Memory, epsilon float64) error { +// SpatialTfGridGeneratorBackward computes the gradient of a grid generation operation. +func (co *Context) SpatialTfGridGeneratorBackward(stDesc *SpatialTransformer, dgrid Memory, dtheta Memory) error { + // DOUBLECHECK: "cudnnSpatialTfGridGeneratorBackward" returns Memory type in Parameter 3 + // call cudnnSpatialTfGridGeneratorBackward + return result(C.cudnnSpatialTfGridGeneratorBackward(co.internal, stDesc.internal, unsafe.Pointer(dgrid.Uintptr()), unsafe.Pointer(dtheta.Uintptr()))) +} + +// SpatialTfGridGeneratorForward generates a grid of coordinates in the input tensor corresponding to each pixel from the output tensor. +func (co *Context) SpatialTfGridGeneratorForward(stDesc *SpatialTransformer, theta Memory, grid Memory) error { + // DOUBLECHECK: "cudnnSpatialTfGridGeneratorForward" returns Memory type in Parameter 3 + // call cudnnSpatialTfGridGeneratorForward + return result(C.cudnnSpatialTfGridGeneratorForward(co.internal, stDesc.internal, unsafe.Pointer(theta.Uintptr()), unsafe.Pointer(grid.Uintptr()))) +} + +// SpatialTfSamplerBackward computes the gradient of a sampling operation. 
+func (co *Context) SpatialTfSamplerBackward(stDesc *SpatialTransformer, alpha float64, xDesc *TensorDescriptor, x Memory, beta float64, dxDesc *TensorDescriptor, dx Memory, alphaDgrid Memory, dyDesc *TensorDescriptor, dy Memory, grid Memory, betaDgrid Memory, dgrid Memory) error { + // DOUBLECHECK: "cudnnSpatialTfSamplerBackward" returns Memory type in Parameter 13 var alphaC, betaC unsafe.Pointer switch xDesc.dataType { case Float, Half: @@ -638,53 +693,8 @@ func (co *Context) BatchNormalizationForwardInference(mode BatchNormMode, alpha default: return errors.Errorf("Unsupported data type: %v", xDesc.dataType) } - // call cudnnBatchNormalizationForwardInference - return result(C.cudnnBatchNormalizationForwardInference(co.internal, mode.C(), alphaC, betaC, xDesc.internal, x.Pointer(), yDesc.internal, y.Pointer(), bnScaleBiasMeanVarDesc.internal, bnScale.Pointer(), bnBias.Pointer(), estimatedMean.Pointer(), estimatedVariance.Pointer(), C.double(epsilon))) -} - -// BatchNormalizationBackward performs the backward BatchNormalization layer computation. -func (co *Context) BatchNormalizationBackward(mode BatchNormMode, alphaDataDiff float64, betaDataDiff float64, alphaParamDiff float64, betaParamDiff float64, xDesc *TensorDescriptor, x Memory, dyDesc *TensorDescriptor, dy Memory, dxDesc *TensorDescriptor, dx Memory, dBnScaleBiasDesc *TensorDescriptor, bnScale Memory, dBnScaleResult Memory, dBnBiasResult Memory, epsilon float64, savedMean Memory, savedInvVariance Memory) error { - var alphaDataDiffC, betaDataDiffC, alphaParamDiffC, betaParamDiffC unsafe.Pointer - switch xDesc.dataType { - case Float, Half: - var alphaDataDiffF, betaDataDiffF, alphaParamDiffF, betaParamDiffF C.float - alphaDataDiffF = C.float(float32(alphaDataDiff)) - betaDataDiffF = C.float(float32(betaDataDiff)) - alphaParamDiffF = C.float(float32(alphaParamDiff)) - betaParamDiffF = C.float(float32(betaParamDiff)) - alphaDataDiffC = unsafe.Pointer(&alphaDataDiffF) - betaDataDiffC = unsafe.Pointer(&betaDataDiffF) - alphaParamDiffC = unsafe.Pointer(&alphaParamDiffF) - betaParamDiffC = unsafe.Pointer(&betaParamDiffF) - case Double: - var alphaDataDiffF, betaDataDiffF, alphaParamDiffF, betaParamDiffF C.double - alphaDataDiffF = C.double(alphaDataDiff) - betaDataDiffF = C.double(betaDataDiff) - alphaParamDiffF = C.double(alphaParamDiff) - betaParamDiffF = C.double(betaParamDiff) - alphaDataDiffC = unsafe.Pointer(&alphaDataDiffF) - betaDataDiffC = unsafe.Pointer(&betaDataDiffF) - alphaParamDiffC = unsafe.Pointer(&alphaParamDiffF) - betaParamDiffC = unsafe.Pointer(&betaParamDiffF) - default: - return errors.Errorf("Unsupported data type: %v", xDesc.dataType) - } - // call cudnnBatchNormalizationBackward - return result(C.cudnnBatchNormalizationBackward(co.internal, mode.C(), alphaDataDiffC, betaDataDiffC, alphaParamDiffC, betaParamDiffC, xDesc.internal, x.Pointer(), dyDesc.internal, dy.Pointer(), dxDesc.internal, dx.Pointer(), dBnScaleBiasDesc.internal, bnScale.Pointer(), dBnScaleResult.Pointer(), dBnBiasResult.Pointer(), C.double(epsilon), savedMean.Pointer(), savedInvVariance.Pointer())) -} - -// SpatialTfGridGeneratorForward generates a grid of coordinates in the input tensor corresponding to each pixel from the output tensor. 
-func (co *Context) SpatialTfGridGeneratorForward(stDesc *SpatialTransformer, theta Memory, grid Memory) error { - // DOUBLECHECK: "cudnnSpatialTfGridGeneratorForward" returns Memory type in Parameter 3 - // call cudnnSpatialTfGridGeneratorForward - return result(C.cudnnSpatialTfGridGeneratorForward(co.internal, stDesc.internal, theta.Pointer(), grid.Pointer())) -} - -// SpatialTfGridGeneratorBackward computes the gradient of a grid generation operation. -func (co *Context) SpatialTfGridGeneratorBackward(stDesc *SpatialTransformer, dgrid Memory, dtheta Memory) error { - // DOUBLECHECK: "cudnnSpatialTfGridGeneratorBackward" returns Memory type in Parameter 3 - // call cudnnSpatialTfGridGeneratorBackward - return result(C.cudnnSpatialTfGridGeneratorBackward(co.internal, stDesc.internal, dgrid.Pointer(), dtheta.Pointer())) + // call cudnnSpatialTfSamplerBackward + return result(C.cudnnSpatialTfSamplerBackward(co.internal, stDesc.internal, alphaC, xDesc.internal, unsafe.Pointer(x.Uintptr()), betaC, dxDesc.internal, unsafe.Pointer(dx.Uintptr()), unsafe.Pointer(alphaDgrid.Uintptr()), dyDesc.internal, unsafe.Pointer(dy.Uintptr()), unsafe.Pointer(grid.Uintptr()), unsafe.Pointer(betaDgrid.Uintptr()), unsafe.Pointer(dgrid.Uintptr()))) } // SpatialTfSamplerForward performs a sampler operation and generates the output tensor using the grid given by the grid generator. @@ -708,12 +718,12 @@ func (co *Context) SpatialTfSamplerForward(stDesc *SpatialTransformer, alpha flo return errors.Errorf("Unsupported data type: %v", xDesc.dataType) } // call cudnnSpatialTfSamplerForward - return result(C.cudnnSpatialTfSamplerForward(co.internal, stDesc.internal, alphaC, xDesc.internal, x.Pointer(), grid.Pointer(), betaC, yDesc.internal, y.Pointer())) + return result(C.cudnnSpatialTfSamplerForward(co.internal, stDesc.internal, alphaC, xDesc.internal, unsafe.Pointer(x.Uintptr()), unsafe.Pointer(grid.Uintptr()), betaC, yDesc.internal, unsafe.Pointer(y.Uintptr()))) } -// SpatialTfSamplerBackward computes the gradient of a sampling operation. -func (co *Context) SpatialTfSamplerBackward(stDesc *SpatialTransformer, alpha float64, xDesc *TensorDescriptor, x Memory, beta float64, dxDesc *TensorDescriptor, dx Memory, alphaDgrid Memory, dyDesc *TensorDescriptor, dy Memory, grid Memory, betaDgrid Memory, dgrid Memory) error { - // DOUBLECHECK: "cudnnSpatialTfSamplerBackward" returns Memory type in Parameter 13 +// TransformTensor copies the scaled data from one tensor to another tensor with a different layout. Those descriptors need to have the same dimensions but not necessarily the same strides. The input and output tensors must not overlap in any way (meaning, tensors cannot be transformed in place). TransformTensor can be used to convert a tensor with an unsupported format to a supported one. 
+func (co *Context) TransformTensor(alpha float64, xDesc *TensorDescriptor, x Memory, beta float64, yDesc *TensorDescriptor, y Memory) error { + // DOUBLECHECK: "cudnnTransformTensor" returns Memory type in Parameter 6 var alphaC, betaC unsafe.Pointer switch xDesc.dataType { case Float, Half: @@ -731,208 +741,19 @@ func (co *Context) SpatialTfSamplerBackward(stDesc *SpatialTransformer, alpha fl default: return errors.Errorf("Unsupported data type: %v", xDesc.dataType) } - // call cudnnSpatialTfSamplerBackward - return result(C.cudnnSpatialTfSamplerBackward(co.internal, stDesc.internal, alphaC, xDesc.internal, x.Pointer(), betaC, dxDesc.internal, dx.Pointer(), alphaDgrid.Pointer(), dyDesc.internal, dy.Pointer(), grid.Pointer(), betaDgrid.Pointer(), dgrid.Pointer())) -} - -// DropoutGetStatesSize is used to query the amount of space required to store the states of the random number generators used by cudnnDropoutForward function. -func (co *Context) DropoutGetStatesSize() (sizeInBytes uintptr, err error) { - var sizeInBytesC C.size_t - // call cudnnDropoutGetStatesSize - err = result(C.cudnnDropoutGetStatesSize(co.internal, &sizeInBytesC)) - sizeInBytes = uintptr(sizeInBytesC) - return -} - -// DropoutForward performs forward dropout operation over x returning results in y. If dropout was used as a parameter to cudnnSetDropoutDescriptor, the approximately dropout fraction of x values will be replaces by 0, and the rest will be scaled by 1/(1-dropout) DropoutForward should not be running concurrently with another DropoutForward function using the same states. -func (co *Context) DropoutForward(dropoutDesc *Dropout, xdesc *TensorDescriptor, x Memory, ydesc *TensorDescriptor, y Memory, reserveSpace Memory, reserveSpaceSizeInBytes uintptr) error { - // DOUBLECHECK: "cudnnDropoutForward" returns Memory type in Parameter 6 - // call cudnnDropoutForward - return result(C.cudnnDropoutForward(co.internal, dropoutDesc.internal, xdesc.internal, x.Pointer(), ydesc.internal, y.Pointer(), reserveSpace.Pointer(), C.size_t(reserveSpaceSizeInBytes))) -} - -// DropoutBackward performs backward dropout operation over dy returning results in dx. If during forward dropout operation value from x was propagated to y then during backward operation value from dy will be propagated to dx, otherwise, dx value will be set to 0. -func (co *Context) DropoutBackward(dropoutDesc *Dropout, dydesc *TensorDescriptor, dy Memory, dxdesc *TensorDescriptor, dx Memory, reserveSpace Memory, reserveSpaceSizeInBytes uintptr) error { - // DOUBLECHECK: "cudnnDropoutBackward" returns Memory type in Parameter 5 - // call cudnnDropoutBackward - return result(C.cudnnDropoutBackward(co.internal, dropoutDesc.internal, dydesc.internal, dy.Pointer(), dxdesc.internal, dx.Pointer(), reserveSpace.Pointer(), C.size_t(reserveSpaceSizeInBytes))) -} - -// GetRNNWorkspaceSize is used to query the amount of work space required to execute the RNN described by rnnDesc with inputs dimensions defined by xDesc. -func (co *Context) GetRNNWorkspaceSize(rnnDesc *RNN, seqLength int, xDesc []*TensorDescriptor) (sizeInBytes uintptr, err error) { - var sizeInBytesC C.size_t - if len(xDesc) != seqLength { - return 0, errors.Errorf("Incorrect xDesc length. Want %d. 
Got %d", seqLength, len(xDesc)) - } - - internals := make([]C.cudnnTensorDescriptor_t, len(xDesc)) - for i := range xDesc { - internals[i] = xDesc[i].internal - } - ptr := (*C.cudnnTensorDescriptor_t)(unsafe.Pointer(&internals[0])) - - // call cudnnGetRNNWorkspaceSize - err = result(C.cudnnGetRNNWorkspaceSize(co.internal, rnnDesc.internal, C.int(seqLength), ptr, &sizeInBytesC)) - sizeInBytes = uintptr(sizeInBytesC) - return -} - -// GetRNNTrainingReserveSize is used to query the amount of reserved space required for training the RNN described by rnnDesc with inputs dimensions defined by xDesc. The same reserved space buffer must be passed to cudnnRNNForwardTraining, cudnnRNNBackwardData and cudnnRNNBackwardWeights. Each of these calls overwrites the contents of the reserved space, however it can safely be backed up and restored between calls if reuse of the memory is desired. -func (co *Context) GetRNNTrainingReserveSize(rnnDesc *RNN, seqLength int, xDesc []*TensorDescriptor) (sizeInBytes uintptr, err error) { - var sizeInBytesC C.size_t - if len(xDesc) != seqLength { - return 0, errors.Errorf("Incorrect xDesc length. Want %d. Got %d", seqLength, len(xDesc)) - } - - internals := make([]C.cudnnTensorDescriptor_t, len(xDesc)) - for i := range xDesc { - internals[i] = xDesc[i].internal - } - ptr := (*C.cudnnTensorDescriptor_t)(unsafe.Pointer(&internals[0])) - - // call cudnnGetRNNTrainingReserveSize - err = result(C.cudnnGetRNNTrainingReserveSize(co.internal, rnnDesc.internal, C.int(seqLength), ptr, &sizeInBytesC)) - sizeInBytes = uintptr(sizeInBytesC) - return + // call cudnnTransformTensor + return result(C.cudnnTransformTensor(co.internal, alphaC, xDesc.internal, unsafe.Pointer(x.Uintptr()), betaC, yDesc.internal, unsafe.Pointer(y.Uintptr()))) } -// GetRNNParamsSize is used to query the amount of parameter space required to execute the RNN described by rnnDesc with inputs dimensions defined by xDesc. -func (co *Context) GetRNNParamsSize(rnnDesc *RNN, xDesc *TensorDescriptor, dataType DataType) (sizeInBytes uintptr, err error) { - var sizeInBytesC C.size_t - // call cudnnGetRNNParamsSize - err = result(C.cudnnGetRNNParamsSize(co.internal, rnnDesc.internal, xDesc.internal, &sizeInBytesC, dataType.C())) - sizeInBytes = uintptr(sizeInBytesC) +// DeriveBNTensorDescriptor derives a secondary tensor descriptor for the batch normalization scale, invVariance, bnBias, and bnScale subtensors from the layer's x data descriptor. +func (te *TensorDescriptor) DeriveBNTensorDescriptor(xDesc *TensorDescriptor, mode BatchNormMode) (derivedBnDesc *TensorDescriptor, err error) { + // TODO: xDesc cudnnTensorDescriptor_t + // call cudnnDeriveBNTensorDescriptor + err = result(C.cudnnDeriveBNTensorDescriptor(te.internal, xDesc.internal, mode.C())) return } -// // GetRNNLinLayerMatrixParams is used to obtain a pointer and a descriptor of every RNN weight matrix in each pseudo-layer within the recurrent network defined by rnnDesc and its input width specified in xDesc. 
-// func (co *Context) GetRNNLinLayerMatrixParams(rnnDesc *RNN, layer int, xDesc *TensorDescriptor, wDesc *Filter, w Memory, linLayerID int) (linLayerMatDesc *Filter, linLayerMat TODO, err error) { -// // TODO: linLayerMatDesc cudnnFilterDescriptor_t -// // TODO: linLayerMat void** -// // call cudnnGetRNNLinLayerMatrixParams -// err = result(C.cudnnGetRNNLinLayerMatrixParams(co.internal, rnnDesc.internal, C.int(layer), xDesc.internal, wDesc.internal, w.Pointer(), C.int(linLayerID), linLayerMatDesc.internal, linLayerMat)) -// return -// } - -// // GetRNNLinLayerBiasParams is used to obtain a pointer and a descriptor of every RNN bias column vector in each pseudo-layer within the recurrent network defined by rnnDesc and its input width specified in xDesc. -// func (co *Context) GetRNNLinLayerBiasParams(rnnDesc *RNN, layer int, xDesc *TensorDescriptor, wDesc *Filter, w Memory, linLayerID int) (linLayerBiasDesc *Filter, linLayerBias TODO, err error) { -// // TODO: linLayerBiasDesc cudnnFilterDescriptor_t -// // TODO: linLayerBias void** -// // call cudnnGetRNNLinLayerBiasParams -// err = result(C.cudnnGetRNNLinLayerBiasParams(co.internal, rnnDesc.internal, C.int(layer), xDesc.internal, wDesc.internal, w.Pointer(), C.int(linLayerID), linLayerBiasDesc.internal, linLayerBias)) -// return -// } - -// RNNForwardInference executes the recurrent neural network described by rnnDesc with inputs x, hx, cx, weights w and outputs y, hy, cy. workspace is required for intermediate storage. RNNForwardInference does not store intermediate data required for training; cudnnRNNForwardTraining should be used for that purpose. -func (co *Context) RNNForwardInference(rnnDesc *RNN, seqLength int, xDesc []*TensorDescriptor, x Memory, hxDesc *TensorDescriptor, hx Memory, cxDesc *TensorDescriptor, cx Memory, wDesc *Filter, w Memory, yDesc []*TensorDescriptor, y Memory, hyDesc *TensorDescriptor, hy Memory, cyDesc *TensorDescriptor, cy Memory, workspace Memory, workSpaceSizeInBytes uintptr) error { - // DOUBLECHECK: "cudnnRNNForwardInference" returns Memory type in Parameter 16 - if len(xDesc) != seqLength { - return errors.Errorf("Incorrect xDesc length. Want %d. Got %d", seqLength, len(xDesc)) - } - - internals := make([]C.cudnnTensorDescriptor_t, len(xDesc)) - for i := range xDesc { - internals[i] = xDesc[i].internal - } - xDescPtr := (*C.cudnnTensorDescriptor_t)(unsafe.Pointer(&internals[0])) - - yDescInternals := make([]C.cudnnTensorDescriptor_t, len(xDesc)) - for i := range yDesc { - yDescInternals[i] = yDesc[i].internal - } - yDescPtr := (*C.cudnnTensorDescriptor_t)(unsafe.Pointer(&yDescInternals[0])) - - // call cudnnRNNForwardInference - return result(C.cudnnRNNForwardInference(co.internal, rnnDesc.internal, C.int(seqLength), xDescPtr, x.Pointer(), hxDesc.internal, hx.Pointer(), cxDesc.internal, cx.Pointer(), wDesc.internal, w.Pointer(), yDescPtr, y.Pointer(), hyDesc.internal, hy.Pointer(), cyDesc.internal, cy.Pointer(), workspace.Pointer(), C.size_t(workSpaceSizeInBytes))) -} - -// RNNForwardTraining executes the recurrent neural network described by rnnDesc with inputs x, hx, cx, weights w and outputs y, hy, cy. workspace is required for intermediate storage. reserveSpace stores data required for training. The same reserveSpace data must be used for future calls to cudnnRNNBackwardData and cudnnRNNBackwardWeights if these execute on the same input data. 
-// reserveSpace is both an input and output -func (co *Context) RNNForwardTraining(rnnDesc *RNN, seqLength int, xDesc []*TensorDescriptor, x Memory, hxDesc *TensorDescriptor, hx Memory, cxDesc *TensorDescriptor, cx Memory, wDesc *Filter, w Memory, yDesc []*TensorDescriptor, y Memory, hyDesc *TensorDescriptor, hy Memory, cyDesc *TensorDescriptor, cy Memory, workspace Memory, workSpaceSizeInBytes uintptr, reserveSpace Memory, reserveSpaceSizeInBytes uintptr) error { - // DOUBLECHECK: "cudnnRNNForwardTraining" returns Memory type in Parameter 16 - internals := make([]C.cudnnTensorDescriptor_t, len(xDesc)) - for i := range xDesc { - internals[i] = xDesc[i].internal - } - ptr := (*C.cudnnTensorDescriptor_t)(unsafe.Pointer(&internals[0])) - - yDescInternals := make([]C.cudnnTensorDescriptor_t, len(xDesc)) - for i := range yDesc { - yDescInternals[i] = yDesc[i].internal - } - yDescPtr := (*C.cudnnTensorDescriptor_t)(unsafe.Pointer(&yDescInternals[0])) - // call cudnnRNNForwardTraining - return result(C.cudnnRNNForwardTraining(co.internal, rnnDesc.internal, C.int(seqLength), ptr, x.Pointer(), hxDesc.internal, hx.Pointer(), cxDesc.internal, cx.Pointer(), wDesc.internal, w.Pointer(), yDescPtr, y.Pointer(), hyDesc.internal, hy.Pointer(), cyDesc.internal, cy.Pointer(), workspace.Pointer(), C.size_t(workSpaceSizeInBytes), reserveSpace.Pointer(), C.size_t(reserveSpaceSizeInBytes))) -} - -// RNNBackwardData executes the recurrent neural network described by rnnDesc with output gradients dy, dhy, dhc, weights w and input gradients dx, dhx, dcx. workspace is required for intermediate storage. The data in reserveSpace must have previously been generated by cudnnRNNForwardTraining. The same reserveSpace data must be used for future calls to cudnnRNNBackwardWeights if they execute on the same input data. 
-// reserveSpace is both an input and output -func (co *Context) RNNBackwardData(rnnDesc *RNN, seqLength int, yDesc []*TensorDescriptor, y Memory, dyDesc []*TensorDescriptor, dy Memory, dhyDesc *TensorDescriptor, dhy Memory, dcyDesc *TensorDescriptor, dcy Memory, wDesc *Filter, w Memory, hxDesc *TensorDescriptor, hx Memory, cxDesc *TensorDescriptor, cx Memory, dxDesc []*TensorDescriptor, dx Memory, dhxDesc *TensorDescriptor, dhx Memory, dcxDesc *TensorDescriptor, dcx Memory, workspace Memory, workSpaceSizeInBytes uintptr, reserveSpace Memory, reserveSpaceSizeInBytes uintptr) error { - // DOUBLECHECK: "cudnnRNNBackwardData" returns Memory type in Parameter 22 - internals := make([]C.cudnnTensorDescriptor_t, len(yDesc)) - for i := range yDesc { - internals[i] = yDesc[i].internal - } - ptr := (*C.cudnnTensorDescriptor_t)(unsafe.Pointer(&internals[0])) - - dyInternals := make([]C.cudnnTensorDescriptor_t, len(dyDesc)) - for i := range dyDesc { - dyInternals[i] = dyDesc[i].internal - } - dyPtr := (*C.cudnnTensorDescriptor_t)(unsafe.Pointer(&dyInternals[0])) - - dxInternals := make([]C.cudnnTensorDescriptor_t, len(dxDesc)) - for i := range dyDesc { - dxInternals[i] = dxDesc[i].internal - } - dxPtr := (*C.cudnnTensorDescriptor_t)(unsafe.Pointer(&dxInternals[0])) - - // call cudnnRNNBackwardData - return result(C.cudnnRNNBackwardData(co.internal, rnnDesc.internal, C.int(seqLength), ptr, y.Pointer(), dyPtr, dy.Pointer(), dhyDesc.internal, dhy.Pointer(), dcyDesc.internal, dcy.Pointer(), wDesc.internal, w.Pointer(), hxDesc.internal, hx.Pointer(), cxDesc.internal, cx.Pointer(), dxPtr, dx.Pointer(), dhxDesc.internal, dhx.Pointer(), dcxDesc.internal, dcx.Pointer(), workspace.Pointer(), C.size_t(workSpaceSizeInBytes), reserveSpace.Pointer(), C.size_t(reserveSpaceSizeInBytes))) -} - -// RNNBackwardWeights accumulates weight gradients dw from the recurrent neural network described by rnnDesc with inputs x, hx, and outputs y. The mode of operation in this case is additive, the weight gradients calculated will be added to those already existing in dw. workspace is required for intermediate storage. The data in reserveSpace must have previously been generated by cudnnRNNBackwardData. -// dw is both an input and output -func (co *Context) RNNBackwardWeights(rnnDesc *RNN, seqLength int, xDesc []*TensorDescriptor, x Memory, hxDesc *TensorDescriptor, hx Memory, yDesc []*TensorDescriptor, y Memory, workspace Memory, workSpaceSizeInBytes uintptr, dwDesc *Filter, dw Memory, reserveSpace Memory, reserveSpaceSizeInBytes uintptr) error { - internals := make([]C.cudnnTensorDescriptor_t, len(xDesc)) - for i := range xDesc { - internals[i] = xDesc[i].internal - } - ptr := (*C.cudnnTensorDescriptor_t)(unsafe.Pointer(&internals[0])) - - yDescInternals := make([]C.cudnnTensorDescriptor_t, len(xDesc)) - for i := range yDesc { - yDescInternals[i] = yDesc[i].internal - } - yDescPtr := (*C.cudnnTensorDescriptor_t)(unsafe.Pointer(&yDescInternals[0])) - - // call cudnnRNNBackwardWeights - return result(C.cudnnRNNBackwardWeights(co.internal, rnnDesc.internal, C.int(seqLength), ptr, x.Pointer(), hxDesc.internal, hx.Pointer(), yDescPtr, y.Pointer(), workspace.Pointer(), C.size_t(workSpaceSizeInBytes), dwDesc.internal, dw.Pointer(), reserveSpace.Pointer(), C.size_t(reserveSpaceSizeInBytes))) -} - -// CTCLoss returns the ctc costs and gradients, given the probabilities and labels. 
-func (co *Context) CTCLoss(probsDesc *TensorDescriptor, probs Memory, labels []int, labelLengths []int, inputLengths []int, costs Memory, gradientsDesc *TensorDescriptor, gradients Memory, algo CTCLossAlgo, ctcLossDesc *CTCLoss, workspace Memory, workSpaceSizeInBytes uintptr) error {
-	// DOUBLECHECK: "cudnnCTCLoss" returns Memory type in Parameter 8
-	labelsPtr, labelsPtrManaged := ints2CIntPtr(labels)
-	defer returnManaged(labelsPtrManaged)
-	labelLengthsPtr, labelLengthsPtrManaged := ints2CIntPtr(labelLengths)
-	defer returnManaged(labelLengthsPtrManaged)
-	inputLengthsPtr, inputLengthsPtrManaged := ints2CIntPtr(inputLengths)
-	defer returnManaged(inputLengthsPtrManaged)
-
-	// call cudnnCTCLoss
-	return result(C.cudnnCTCLoss(co.internal, probsDesc.internal, probs.Pointer(), labelsPtr, labelLengthsPtr, inputLengthsPtr, costs.Pointer(), gradientsDesc.internal, gradients.Pointer(), algo.C(), ctcLossDesc.internal, workspace.Pointer(), C.size_t(workSpaceSizeInBytes)))
-}
-
-// // Derives a secondary tensor descriptor for BatchNormalization scale, invVariance, bnBias, bnScale subtensors from the layer's x data descriptor. Use the tensor descriptor produced by this function as the bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc parameters in Spatial and Per-Activation Batch Normalization forward and backward functions. Resulting dimensions will be 1xC(x1)x1x1 for BATCHNORM_MODE_SPATIAL and 1xC(xD)xHxW for BATCHNORM_MODE_PER_ACTIVATION (parentheses for 5D). For HALF input data type the resulting tensor descriptor will have a FLOAT type. For other data types it will have the same type as the input data.
-// func (te *TensorDescriptor) DeriveBNTensorDescriptor(xDesc *TensorDescriptor, mode BatchNormMode) (derivedBnDesc *TensorDescriptor, err error) {
-// // TODO
-// // call cudnnDeriveBNTensorDescriptor
-// err = result(C.cudnnDeriveBNTensorDescriptor(te.internal, xDesc.internal, mode.C()))
-// return
-// }
-
-// DropoutGetReserveSpaceSize is used to query the amount of reserve needed to run dropout with the input dimensions given by xDesc. The same reserve space is expected to be passed to cudnnDropoutForward and cudnnDropoutBackward, and its contents is expected to remain unchanged between cudnnDropoutForward and cudnnDropoutBackward calls.
+// DropoutGetReserveSpaceSize is used to query the amount of reserve needed to run dropout with the input dimensions given by xDesc. The same reserve space is expected to be passed to cudnnDropoutForward() and cudnnDropoutBackward(), and its contents are expected to remain unchanged between cudnnDropoutForward() and cudnnDropoutBackward() calls.
 func (te *TensorDescriptor) DropoutGetReserveSpaceSize() (sizeInBytes uintptr, err error) {
 	var sizeInBytesC C.size_t
 	// call cudnnDropoutGetReserveSpaceSize
@@ -940,3 +761,9 @@ func (te *TensorDescriptor) DropoutGetReserveSpaceSize() (sizeInBytes uintptr, e
 	sizeInBytes = uintptr(sizeInBytesC)
 	return
 }
+
+// RestoreDropoutDescriptor restores a dropout descriptor to a previously saved-off state.
+func (dr *Dropout) RestoreDropoutDescriptor(handle *Context, dropout float32, states Memory, stateSizeInBytes uintptr, seed uint64) error {
+	// call cudnnRestoreDropoutDescriptor
+	return result(C.cudnnRestoreDropoutDescriptor(dr.internal, handle.internal, C.float(dropout), unsafe.Pointer(states.Uintptr()), C.size_t(stateSizeInBytes), C.ulonglong(seed)))
+}
diff --git a/dnn/generated_activation.go b/dnn/generated_activation.go
index 6da8575..26346a8 100644
--- a/dnn/generated_activation.go
+++ b/dnn/generated_activation.go
@@ -36,6 +36,9 @@ func NewActivation(mode ActivationMode, reluNanOpt NanPropagation, coef float64)
 	return retVal, nil
 }
 
+// C returns the cgo representation.
+func (a *Activation) C() C.cudnnActivationDescriptor_t { return a.internal }
+
 // Mode returns the internal mode.
 func (a *Activation) Mode() ActivationMode { return a.mode }
 
diff --git a/dnn/generated_algorithmdescriptor.go b/dnn/generated_algorithmdescriptor.go
new file mode 100644
index 0000000..51e8b4d
--- /dev/null
+++ b/dnn/generated_algorithmdescriptor.go
@@ -0,0 +1,67 @@
+package cudnn
+
+/* WAS Generated by gencudnn. DO NOT EDIT */
+
+// #include <cudnn.h>
+// #include "algorithm.h"
+import "C"
+import (
+	"fmt"
+	"runtime"
+)
+
+type Algorithm interface{}
+
+// AlgorithmDescriptor is a representation of cudnnAlgorithmDescriptor_t.
+type AlgorithmDescriptor struct {
+	internal C.cudnnAlgorithmDescriptor_t
+
+	algorithm Algorithm
+}
+
+// NewAlgorithmDescriptor creates a new AlgorithmDescriptor.
+func NewAlgorithmDescriptor(algorithm Algorithm) (retVal *AlgorithmDescriptor, err error) {
+	var internal C.cudnnAlgorithmDescriptor_t
+	if err := result(C.cudnnCreateAlgorithmDescriptor(&internal)); err != nil {
+		return nil, err
+	}
+
+	switch a := algorithm.(type) {
+	case ConvolutionFwdAlgo:
+		if err := result(C.cudnnSetAlgorithmDescriptor(internal, C.makeConvFwdAlgo(a.C()))); err != nil {
+			return nil, err
+		}
+	case ConvolutionBwdFilterAlgo:
+		if err := result(C.cudnnSetAlgorithmDescriptor(internal, C.makeConvBwdFilterAlgo(a.C()))); err != nil {
+			return nil, err
+		}
+	case ConvolutionBwdDataAlgo:
+		if err := result(C.cudnnSetAlgorithmDescriptor(internal, C.makeConvBwdDataAlgo(a.C()))); err != nil {
+			return nil, err
+		}
+	case RNNAlgo:
+		if err := result(C.cudnnSetAlgorithmDescriptor(internal, C.makeRNNAlgo(a.C()))); err != nil {
+			return nil, err
+		}
+	case CTCLossAlgo:
+		if err := result(C.cudnnSetAlgorithmDescriptor(internal, C.makeCTCLossAlgo(a.C()))); err != nil {
+			return nil, err
+		}
+	default:
+		return nil, fmt.Errorf(`unknown algorithm specified`)
+	}
+
+	retVal = &AlgorithmDescriptor{
+		internal:  internal,
+		algorithm: algorithm,
+	}
+	runtime.SetFinalizer(retVal, destroyAlgorithmDescriptor)
+	return retVal, nil
+}
+
+// C returns the internal cgo representation
+func (a *AlgorithmDescriptor) C() C.cudnnAlgorithmDescriptor_t { return a.internal }
+
+func destroyAlgorithmDescriptor(obj *AlgorithmDescriptor) {
+	C.cudnnDestroyAlgorithmDescriptor(obj.internal)
+}
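Example (illustrative, not part of the patch): wrapping a concrete algorithm value in the new AlgorithmDescriptor; the descriptor is released by the finalizer registered in the constructor:

	// newConvFwdAlgoDesc is a sketch; any of the five supported algo types
	// (ConvolutionFwdAlgo, ConvolutionBwdFilterAlgo, ConvolutionBwdDataAlgo,
	// RNNAlgo, CTCLossAlgo) may be passed to NewAlgorithmDescriptor.
	func newConvFwdAlgoDesc(algo ConvolutionFwdAlgo) (*AlgorithmDescriptor, error) {
		desc, err := NewAlgorithmDescriptor(algo) // unknown algorithm types are rejected
		if err != nil {
			return nil, err
		}
		return desc, nil
	}
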
diff --git a/dnn/generated_algorithmperformance.go b/dnn/generated_algorithmperformance.go
new file mode 100644
index 0000000..7314e96
--- /dev/null
+++ b/dnn/generated_algorithmperformance.go
@@ -0,0 +1,63 @@
+package cudnn
+
+/* WAS Generated by gencudnn. DO NOT EDIT */
+
+// #include <cudnn.h>
+import "C"
+import "runtime"
+
+// AlgorithmPerformance is a representation of cudnnAlgorithmPerformance_t.
+type AlgorithmPerformance struct {
+	internal C.cudnnAlgorithmPerformance_t
+
+	n int
+
+	algoDesc *AlgorithmDescriptor
+	status   Status
+	time     float32
+	memory   uintptr
+}
+
+// NewAlgorithmPerformance creates `n` new cudnnAlgorithmPerformance objects, wrapped in AlgorithmPerformance.
+func NewAlgorithmPerformance(algoDesc *AlgorithmDescriptor, n int, status Status, time float32, memory uintptr) (retVal *AlgorithmPerformance, err error) {
+	var internal C.cudnnAlgorithmPerformance_t
+	if err := result(C.cudnnCreateAlgorithmPerformance(&internal, C.int(n))); err != nil {
+		return nil, err
+	}
+
+	if err := result(C.cudnnSetAlgorithmPerformance(internal, algoDesc.internal, status.C(), C.float(time), C.size_t(memory))); err != nil {
+		return nil, err
+	}
+
+	retVal = &AlgorithmPerformance{
+		internal: internal,
+		n:        n, // record n: the destructor passes it to cudnnDestroyAlgorithmPerformance
+		algoDesc: algoDesc,
+		status:   status,
+		time:     time,
+		memory:   memory,
+	}
+	runtime.SetFinalizer(retVal, destroyAlgorithmPerformance)
+	return retVal, nil
+}
+
+// C returns the cgo representation.
+func (a *AlgorithmPerformance) C() C.cudnnAlgorithmPerformance_t { return a.internal }
+
+// AlgoDesc returns the internal algoDesc.
+func (a *AlgorithmPerformance) AlgoDesc() *AlgorithmDescriptor { return a.algoDesc }
+
+// Status returns the internal status.
+func (a *AlgorithmPerformance) Status() Status { return a.status }
+
+// Time returns the internal time.
+func (a *AlgorithmPerformance) Time() float32 { return a.time }
+
+// Memory returns the internal memory.
+func (a *AlgorithmPerformance) Memory() uintptr { return a.memory }
+
+// N returns how many cudnnAlgorithmPerformance objects were created.
+func (a *AlgorithmPerformance) N() int { return a.n }
+
+func destroyAlgorithmPerformance(obj *AlgorithmPerformance) {
+	C.cudnnDestroyAlgorithmPerformance(&obj.internal, C.int(obj.n))
+}
diff --git a/dnn/generated_attention.go b/dnn/generated_attention.go
new file mode 100644
index 0000000..71c1d7b
--- /dev/null
+++ b/dnn/generated_attention.go
@@ -0,0 +1,131 @@
+package cudnn
+
+/* Generated by gencudnn. DO NOT EDIT */
+
+// #include <cudnn.h>
+import "C"
+import "runtime"
+
+// Attention is a representation of cudnnAttnDescriptor_t.
+type Attention struct {
+	internal C.cudnnAttnDescriptor_t
+
+	attnMode        uint
+	nHeads          int
+	smScaler        float64
+	dataType        DataType
+	computePrec     DataType
+	mathType        MathType
+	attnDropoutDesc *Dropout
+	postDropoutDesc *Dropout
+	qSize           int
+	kSize           int
+	vSize           int
+	qProjSize       int
+	kProjSize       int
+	vProjSize       int
+	oProjSize       int
+	qoMaxSeqLength  int
+	kvMaxSeqLength  int
+	maxBatchSize    int
+	maxBeamSize     int
+}
+
+// NewAttention creates a new Attention.
+func NewAttention(attnMode uint, nHeads int, smScaler float64, dataType DataType, computePrec DataType, mathType MathType, attnDropoutDesc *Dropout, postDropoutDesc *Dropout, qSize int, kSize int, vSize int, qProjSize int, kProjSize int, vProjSize int, oProjSize int, qoMaxSeqLength int, kvMaxSeqLength int, maxBatchSize int, maxBeamSize int) (retVal *Attention, err error) { + var internal C.cudnnAttnDescriptor_t + if err := result(C.cudnnCreateAttnDescriptor(&internal)); err != nil { + return nil, err + } + + if err := result(C.cudnnSetAttnDescriptor(internal, C.uint(attnMode), C.int(nHeads), C.double(smScaler), dataType.C(), computePrec.C(), mathType.C(), attnDropoutDesc.internal, postDropoutDesc.internal, C.int(qSize), C.int(kSize), C.int(vSize), C.int(qProjSize), C.int(kProjSize), C.int(vProjSize), C.int(oProjSize), C.int(qoMaxSeqLength), C.int(kvMaxSeqLength), C.int(maxBatchSize), C.int(maxBeamSize))); err != nil { + return nil, err + } + + retVal = &Attention{ + internal: internal, + attnMode: attnMode, + nHeads: nHeads, + smScaler: smScaler, + dataType: dataType, + computePrec: computePrec, + mathType: mathType, + attnDropoutDesc: attnDropoutDesc, + postDropoutDesc: postDropoutDesc, + qSize: qSize, + kSize: kSize, + vSize: vSize, + qProjSize: qProjSize, + kProjSize: kProjSize, + vProjSize: vProjSize, + oProjSize: oProjSize, + qoMaxSeqLength: qoMaxSeqLength, + kvMaxSeqLength: kvMaxSeqLength, + maxBatchSize: maxBatchSize, + maxBeamSize: maxBeamSize, + } + runtime.SetFinalizer(retVal, destroyAttention) + return retVal, nil +} + +// C returns the internal cgo representation. +func (a *Attention) C() C.cudnnAttnDescriptor_t { return a.internal } + +// AttnMode returns the internal attnMode. +func (a *Attention) AttnMode() uint { return a.attnMode } + +// NHeads returns the internal nHeads. +func (a *Attention) NHeads() int { return a.nHeads } + +// SmScaler returns the internal smScaler. +func (a *Attention) SmScaler() float64 { return a.smScaler } + +// DataType returns the internal dataType. +func (a *Attention) DataType() DataType { return a.dataType } + +// ComputePrec returns the internal computePrec. +func (a *Attention) ComputePrec() DataType { return a.computePrec } + +// MathType returns the internal mathType. +func (a *Attention) MathType() MathType { return a.mathType } + +// AttnDropoutDesc returns the internal attnDropoutDesc. +func (a *Attention) AttnDropoutDesc() *Dropout { return a.attnDropoutDesc } + +// PostDropoutDesc returns the internal postDropoutDesc. +func (a *Attention) PostDropoutDesc() *Dropout { return a.postDropoutDesc } + +// QSize returns the internal qSize. +func (a *Attention) QSize() int { return a.qSize } + +// KSize returns the internal kSize. +func (a *Attention) KSize() int { return a.kSize } + +// VSize returns the internal vSize. +func (a *Attention) VSize() int { return a.vSize } + +// QProjSize returns the internal qProjSize. +func (a *Attention) QProjSize() int { return a.qProjSize } + +// KProjSize returns the internal kProjSize. +func (a *Attention) KProjSize() int { return a.kProjSize } + +// VProjSize returns the internal vProjSize. +func (a *Attention) VProjSize() int { return a.vProjSize } + +// OProjSize returns the internal oProjSize. +func (a *Attention) OProjSize() int { return a.oProjSize } + +// QoMaxSeqLength returns the internal qoMaxSeqLength. +func (a *Attention) QoMaxSeqLength() int { return a.qoMaxSeqLength } + +// KvMaxSeqLength returns the internal kvMaxSeqLength. 
+func (a *Attention) KvMaxSeqLength() int { return a.kvMaxSeqLength }
+
+// MaxBatchSize returns the internal maxBatchSize.
+func (a *Attention) MaxBatchSize() int { return a.maxBatchSize }
+
+// MaxBeamSize returns the internal maxBeamSize.
+func (a *Attention) MaxBeamSize() int { return a.maxBeamSize }
+
+func destroyAttention(obj *Attention) { C.cudnnDestroyAttnDescriptor(obj.internal) }
diff --git a/dnn/generated_backend.go b/dnn/generated_backend.go
new file mode 100644
index 0000000..0c18219
--- /dev/null
+++ b/dnn/generated_backend.go
@@ -0,0 +1,64 @@
+package cudnn
+
+/* WAS Generated by gencudnn. DO NOT EDIT */
+
+// #include <cudnn.h>
+import "C"
+import (
+	"runtime"
+	"unsafe"
+)
+
+// Backend is a representation of cudnnBackendDescriptor_t.
+type Backend struct {
+	internal C.cudnnBackendDescriptor_t
+
+	backendType     BackendDescriptorType
+	attributeName   BackendAttributeName
+	attributeType   BackendAttributeType
+	elementCount    int64
+	arrayOfElements Memory
+}
+
+// NewBackend creates a new Backend.
+func NewBackend(attributeName BackendAttributeName, attributeType BackendAttributeType, backendType BackendDescriptorType, elementCount int64, arrayOfElements Memory) (retVal *Backend, err error) {
+	var internal C.cudnnBackendDescriptor_t
+	if err := result(C.cudnnBackendCreateDescriptor(backendType.C(), &internal)); err != nil {
+		return nil, err
+	}
+
+	if err := result(C.cudnnBackendSetAttribute(internal, attributeName.C(), attributeType.C(), C.int64_t(elementCount), unsafe.Pointer(arrayOfElements.Uintptr()))); err != nil {
+		return nil, err
+	}
+
+	retVal = &Backend{
+		internal:        internal,
+		backendType:     backendType,
+		attributeName:   attributeName,
+		attributeType:   attributeType,
+		elementCount:    elementCount,
+		arrayOfElements: arrayOfElements,
+	}
+	runtime.SetFinalizer(retVal, destroyBackend)
+	return retVal, nil
+}
+
+// C returns the internal cgo representation.
+func (b *Backend) C() C.cudnnBackendDescriptor_t { return b.internal }
+
+// Type returns the backend type.
+func (b *Backend) Type() BackendDescriptorType { return b.backendType }
+
+// AttributeName returns the internal attributeName.
+func (b *Backend) AttributeName() BackendAttributeName { return b.attributeName }
+
+// AttributeType returns the internal attributeType.
+func (b *Backend) AttributeType() BackendAttributeType { return b.attributeType }
+
+// ElementCount returns the internal elementCount parameter.
+func (b *Backend) ElementCount() int64 { return b.elementCount }
+
+// ArrayOfElements returns the internal arrayOfElements.
+func (b *Backend) ArrayOfElements() Memory { return b.arrayOfElements }
+
+func destroyBackend(obj *Backend) { C.cudnnBackendDestroyDescriptor(obj.internal) }
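NewBackend above round-trips the address of arrayOfElements through Uintptr() and back into an unsafe.Pointer. That is only tenable if the memory behind a Memory implementation is C- or device-allocated, so the Go collector can neither move nor free it between the two conversions. A cgo-free sketch of that assumption (Memory is reduced to the single method used here; devMem and its address are invented for illustration):

package main

import (
	"fmt"
	"unsafe"
)

// Memory is cut down to the one method NewBackend relies on.
type Memory interface {
	Uintptr() uintptr
}

// devMem fakes device-allocated memory: a raw address the Go collector
// knows nothing about, so it can never move or free what it points to.
type devMem uintptr

func (m devMem) Uintptr() uintptr { return uintptr(m) }

func main() {
	var m Memory = devMem(0xdeadbeef) // placeholder address, never dereferenced
	p := unsafe.Pointer(m.Uintptr())  // the conversion NewBackend performs
	fmt.Printf("would be passed to cudnnBackendSetAttribute: %p\n", p)
}

go vet flags uintptr-to-unsafe.Pointer conversions like this one, so Memory implementations backed by ordinary Go-allocated slices should not be passed to NewBackend.

diff --git a/dnn/generated_ctcloss.go b/dnn/generated_ctcloss.go
index 7fc8fe3..57ca7a1 100644
--- a/dnn/generated_ctcloss.go
+++ b/dnn/generated_ctcloss.go
@@ -4,35 +4,24 @@ package cudnn
 // #include <cudnn.h>
 import "C"
-import "runtime"
+import "github.com/pkg/errors"

 // CTCLoss is a representation of cudnnCTCLossDescriptor_t.
 type CTCLoss struct {
 	internal C.cudnnCTCLossDescriptor_t

-	compType DataType
+	//TODO
 }

 // NewCTCLoss creates a new CTCLoss.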
 func NewCTCLoss(compType DataType) (retVal *CTCLoss, err error) {
-	var internal C.cudnnCTCLossDescriptor_t
-	if err := result(C.cudnnCreateCTCLossDescriptor(&internal)); err != nil {
-		return nil, err
-	}
-
-	if err := result(C.cudnnSetCTCLossDescriptor(internal, compType.C())); err != nil {
-		return nil, err
-	}
-
-	retVal = &CTCLoss{
-		internal: internal,
-		compType: compType,
-	}
-	runtime.SetFinalizer(retVal, destroyCTCLoss)
-	return retVal, nil
+	// available "Set" methods:
+	// 	cudnnSetCTCLossDescriptor
+	// 	cudnnSetCTCLossDescriptorEx
+	// 	cudnnSetCTCLossDescriptor_v8
+	return nil, errors.Errorf("TODO: Manual Intervention required")
 }

-// CompType returns the internal compType.
-func (c *CTCLoss) CompType() DataType { return c.compType }
+// TODO: Getters for CTCLoss

 func destroyCTCLoss(obj *CTCLoss) { C.cudnnDestroyCTCLossDescriptor(obj.internal) }
diff --git a/dnn/generated_enums.go b/dnn/generated_enums.go
index 9713cbb..4df875d 100644
--- a/dnn/generated_enums.go
+++ b/dnn/generated_enums.go
@@ -5,60 +5,290 @@ package cudnn
 // #include <cudnn.h>
 import "C"

-//go:generate stringer -type=ErrQueryMode
+type ActivationMode int

-type ErrQueryMode int
+const (
+	Sigmoid     ActivationMode = C.CUDNN_ACTIVATION_SIGMOID
+	ReLU        ActivationMode = C.CUDNN_ACTIVATION_RELU
+	Tanh        ActivationMode = C.CUDNN_ACTIVATION_TANH
+	ClippedReLU ActivationMode = C.CUDNN_ACTIVATION_CLIPPED_RELU
+	Elu         ActivationMode = C.CUDNN_ACTIVATION_ELU
+	Identity    ActivationMode = C.CUDNN_ACTIVATION_IDENTITY
+)
+
+// C returns the C representation of ActivationMode
+func (e ActivationMode) C() C.cudnnActivationMode_t { return C.cudnnActivationMode_t(e) }
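Every enum in this file follows the same pattern: a typed Go int whose constants are initialized directly from the corresponding C enum values, so C() is a bare conversion and String() (generated_enums_strings.go below) is a map lookup. A cgo-free sketch of the pattern, with invented names and values standing in for the C constants:

package main

import "fmt"

// mode mirrors the generated enums: a typed int whose values would
// normally come straight from the cuDNN headers.
type mode int

const (
	sigmoid mode = iota // stands in for C.CUDNN_ACTIVATION_SIGMOID
	relu                // stands in for C.CUDNN_ACTIVATION_RELU
)

var modeNames = map[mode]string{sigmoid: "Sigmoid", relu: "ReLU"}

// String is a map lookup, as in generated_enums_strings.go.
func (m mode) String() string { return modeNames[m] }

// c plays the role of the generated C() methods: a bare integer
// conversion, valid because the Go values are the C values.
func (m mode) c() int { return int(m) }

func main() {
	fmt.Println(relu, relu.c()) // prints: ReLU 1
}

Because the Go constants carry the C values, conversion in either direction is free and nothing can drift out of sync with the header short of regeneration.

+
+type BackendAttributeName int

 const (
-	Rawcode     ErrQueryMode = C.CUDNN_ERRQUERY_RAWCODE
-	Nonblocking ErrQueryMode = C.CUDNN_ERRQUERY_NONBLOCKING
-	Blocking    ErrQueryMode = C.CUDNN_ERRQUERY_BLOCKING
+	BEAttrNamePointwiseMode             BackendAttributeName = C.CUDNN_ATTR_POINTWISE_MODE
+	BEAttrNamePointwiseMathPrec         BackendAttributeName = C.CUDNN_ATTR_POINTWISE_MATH_PREC
+	BEAttrNamePointwiseNanPropagation   BackendAttributeName = C.CUDNN_ATTR_POINTWISE_NAN_PROPAGATION
+	BEAttrNamePointwiseReluLowerClip    BackendAttributeName = C.CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP
+	BEAttrNamePointwiseReluUpperClip    BackendAttributeName = C.CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP
+	BEAttrNameConvolutionCompType       BackendAttributeName = C.CUDNN_ATTR_CONVOLUTION_COMP_TYPE
+	BEAttrNameConvolutionConvMode       BackendAttributeName = C.CUDNN_ATTR_CONVOLUTION_CONV_MODE
+	BEAttrNameConvolutionDilations      BackendAttributeName = C.CUDNN_ATTR_CONVOLUTION_DILATIONS
+	BEAttrNameConvolutionFilterStrides  BackendAttributeName = C.CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES
+	BEAttrNameConvolutionPostPaddings   BackendAttributeName = C.CUDNN_ATTR_CONVOLUTION_POST_PADDINGS
+	BEAttrNameConvolutionPrePaddings    BackendAttributeName = C.CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS
+	BEAttrNameConvolutionSpatialDims    BackendAttributeName = C.CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS
+	BEAttrNameEngineheurMode            BackendAttributeName = C.CUDNN_ATTR_ENGINEHEUR_MODE
+	BEAttrNameEngineheurOperationGraph  BackendAttributeName = C.CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH
+	BEAttrNameEngineheurResults         BackendAttributeName = C.CUDNN_ATTR_ENGINEHEUR_RESULTS
+	BEAttrNameEnginecfgEngine           BackendAttributeName = C.CUDNN_ATTR_ENGINECFG_ENGINE
+	BEAttrNameEnginecfgIntermediateInfo BackendAttributeName = C.CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO
+	BEAttrNameEnginecfgKnobChoices      BackendAttributeName = C.CUDNN_ATTR_ENGINECFG_KNOB_CHOICES
+	BEAttrNameExecutionPlanHandle       BackendAttributeName = 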
C.CUDNN_ATTR_EXECUTION_PLAN_HANDLE + BEAttrNameExecutionPlanEngineConfig BackendAttributeName = C.CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG + BEAttrNameExecutionPlanWorkspaceSize BackendAttributeName = C.CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE + BEAttrNameExecutionPlanComputedIntermediateUids BackendAttributeName = C.CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS + BEAttrNameExecutionPlanRunOnlyIntermediateUids BackendAttributeName = C.CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS + BEAttrNameIntermediateInfoUniqueId BackendAttributeName = C.CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID + BEAttrNameIntermediateInfoSize BackendAttributeName = C.CUDNN_ATTR_INTERMEDIATE_INFO_SIZE + BEAttrNameIntermediateInfoDependentDataUids BackendAttributeName = C.CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS + BEAttrNameIntermediateInfoDependentAttributes BackendAttributeName = C.CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES + BEAttrNameKnobChoiceKnobType BackendAttributeName = C.CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE + BEAttrNameKnobChoiceKnobValue BackendAttributeName = C.CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE + BEAttrNameOperationConvolutionForwardAlpha BackendAttributeName = C.CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA + BEAttrNameOperationConvolutionForwardBeta BackendAttributeName = C.CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA + BEAttrNameOperationConvolutionForwardConvDesc BackendAttributeName = C.CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC + BEAttrNameOperationConvolutionForwardW BackendAttributeName = C.CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W + BEAttrNameOperationConvolutionForwardX BackendAttributeName = C.CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X + BEAttrNameOperationConvolutionForwardY BackendAttributeName = C.CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y + BEAttrNameOperationConvolutionBwdDataAlpha BackendAttributeName = C.CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA + BEAttrNameOperationConvolutionBwdDataBeta BackendAttributeName = C.CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA + BEAttrNameOperationConvolutionBwdDataConvDesc BackendAttributeName = C.CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC + BEAttrNameOperationConvolutionBwdDataW BackendAttributeName = C.CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W + BEAttrNameOperationConvolutionBwdDataDx BackendAttributeName = C.CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX + BEAttrNameOperationConvolutionBwdDataDy BackendAttributeName = C.CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY + BEAttrNameOperationConvolutionBwdFilterAlpha BackendAttributeName = C.CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA + BEAttrNameOperationConvolutionBwdFilterBeta BackendAttributeName = C.CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA + BEAttrNameOperationConvolutionBwdFilterConvDesc BackendAttributeName = C.CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC + BEAttrNameOperationConvolutionBwdFilterDw BackendAttributeName = C.CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW + BEAttrNameOperationConvolutionBwdFilterX BackendAttributeName = C.CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X + BEAttrNameOperationConvolutionBwdFilterDy BackendAttributeName = C.CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY + BEAttrNameOperationPointwisePwDescriptor BackendAttributeName = C.CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR + BEAttrNameOperationPointwiseXdesc BackendAttributeName = C.CUDNN_ATTR_OPERATION_POINTWISE_XDESC + BEAttrNameOperationPointwiseBdesc BackendAttributeName = C.CUDNN_ATTR_OPERATION_POINTWISE_BDESC + BEAttrNameOperationPointwiseYdesc 
BackendAttributeName = C.CUDNN_ATTR_OPERATION_POINTWISE_YDESC + BEAttrNameOperationPointwiseAlpha1 BackendAttributeName = C.CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 + BEAttrNameOperationPointwiseAlpha2 BackendAttributeName = C.CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2 + BEAttrNameOperationGenstatsMode BackendAttributeName = C.CUDNN_ATTR_OPERATION_GENSTATS_MODE + BEAttrNameOperationGenstatsMathPrec BackendAttributeName = C.CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC + BEAttrNameOperationGenstatsXdesc BackendAttributeName = C.CUDNN_ATTR_OPERATION_GENSTATS_XDESC + BEAttrNameOperationGenstatsSumdesc BackendAttributeName = C.CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC + BEAttrNameOperationGenstatsSqsumdesc BackendAttributeName = C.CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC + BEAttrNameOperationgraphHandle BackendAttributeName = C.CUDNN_ATTR_OPERATIONGRAPH_HANDLE + BEAttrNameOperationgraphOps BackendAttributeName = C.CUDNN_ATTR_OPERATIONGRAPH_OPS + BEAttrNameOperationgraphEngineGlobalCount BackendAttributeName = C.CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT + BEAttrNameTensorByteAlignment BackendAttributeName = C.CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT + BEAttrNameTensorDataType BackendAttributeName = C.CUDNN_ATTR_TENSOR_DATA_TYPE + BEAttrNameTensorDimensions BackendAttributeName = C.CUDNN_ATTR_TENSOR_DIMENSIONS + BEAttrNameTensorStrides BackendAttributeName = C.CUDNN_ATTR_TENSOR_STRIDES + BEAttrNameTensorVectorCount BackendAttributeName = C.CUDNN_ATTR_TENSOR_VECTOR_COUNT + BEAttrNameTensorVectorizedDimension BackendAttributeName = C.CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION + BEAttrNameTensorUniqueId BackendAttributeName = C.CUDNN_ATTR_TENSOR_UNIQUE_ID + BEAttrNameTensorIsVirtual BackendAttributeName = C.CUDNN_ATTR_TENSOR_IS_VIRTUAL + BEAttrNameVariantPackUniqueIds BackendAttributeName = C.CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS + BEAttrNameVariantPackDataPointers BackendAttributeName = C.CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS + BEAttrNameVariantPackIntermediates BackendAttributeName = C.CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES + BEAttrNameVariantPackWorkspace BackendAttributeName = C.CUDNN_ATTR_VARIANT_PACK_WORKSPACE + BEAttrNameLayoutInfoTensorUid BackendAttributeName = C.CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID + BEAttrNameLayoutInfoTypes BackendAttributeName = C.CUDNN_ATTR_LAYOUT_INFO_TYPES + BEAttrNameKnobInfoType BackendAttributeName = C.CUDNN_ATTR_KNOB_INFO_TYPE + BEAttrNameKnobInfoMaximumValue BackendAttributeName = C.CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE + BEAttrNameKnobInfoMinimumValue BackendAttributeName = C.CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE + BEAttrNameKnobInfoStride BackendAttributeName = C.CUDNN_ATTR_KNOB_INFO_STRIDE + BEAttrNameEngineOperationGraph BackendAttributeName = C.CUDNN_ATTR_ENGINE_OPERATION_GRAPH + BEAttrNameEngineGlobalIndex BackendAttributeName = C.CUDNN_ATTR_ENGINE_GLOBAL_INDEX + BEAttrNameEngineKnobInfo BackendAttributeName = C.CUDNN_ATTR_ENGINE_KNOB_INFO + BEAttrNameEngineNumericalNote BackendAttributeName = C.CUDNN_ATTR_ENGINE_NUMERICAL_NOTE + BEAttrNameEngineLayoutInfo BackendAttributeName = C.CUDNN_ATTR_ENGINE_LAYOUT_INFO ) -// C returns the C representation of ErrQueryMode -func (e ErrQueryMode) C() C.cudnnErrQueryMode_t { return C.cudnnErrQueryMode_t(e) } +// C returns the C representation of BackendAttributeName +func (e BackendAttributeName) C() C.cudnnBackendAttributeName_t { + return C.cudnnBackendAttributeName_t(e) +} -//go:generate stringer -type=DataType +type BackendAttributeType int -type DataType int +const ( + BEAttrHandle BackendAttributeType = C.CUDNN_TYPE_HANDLE + BEAttrDataType 
BackendAttributeType = C.CUDNN_TYPE_DATA_TYPE + BEAttrBoolean BackendAttributeType = C.CUDNN_TYPE_BOOLEAN + BEAttrInt64 BackendAttributeType = C.CUDNN_TYPE_INT64 + BEAttrFloat BackendAttributeType = C.CUDNN_TYPE_FLOAT + BEAttrDouble BackendAttributeType = C.CUDNN_TYPE_DOUBLE + BEAttrVoidPtr BackendAttributeType = C.CUDNN_TYPE_VOID_PTR + BEAttrConvolutionMode BackendAttributeType = C.CUDNN_TYPE_CONVOLUTION_MODE + BEAttrHeurMode BackendAttributeType = C.CUDNN_TYPE_HEUR_MODE + BEAttrKnobType BackendAttributeType = C.CUDNN_TYPE_KNOB_TYPE + BEAttrNanPropogation BackendAttributeType = C.CUDNN_TYPE_NAN_PROPOGATION + BEAttrNumericalNote BackendAttributeType = C.CUDNN_TYPE_NUMERICAL_NOTE + BEAttrLayoutType BackendAttributeType = C.CUDNN_TYPE_LAYOUT_TYPE + BEAttrAttribName BackendAttributeType = C.CUDNN_TYPE_ATTRIB_NAME + BEAttrPointwiseMode BackendAttributeType = C.CUDNN_TYPE_POINTWISE_MODE + BEAttrBackendDescriptor BackendAttributeType = C.CUDNN_TYPE_BACKEND_DESCRIPTOR + BEAttrGenstatsMode BackendAttributeType = C.CUDNN_TYPE_GENSTATS_MODE +) + +// C returns the C representation of BackendAttributeType +func (e BackendAttributeType) C() C.cudnnBackendAttributeType_t { + return C.cudnnBackendAttributeType_t(e) +} + +type BackendDescriptorType int const ( - Float DataType = C.CUDNN_DATA_FLOAT - Double DataType = C.CUDNN_DATA_DOUBLE - Half DataType = C.CUDNN_DATA_HALF - Int8 DataType = C.CUDNN_DATA_INT8 - Int32 DataType = C.CUDNN_DATA_INT32 - Int8x4 DataType = C.CUDNN_DATA_INT8x4 + BEDescriptorPointwiseDescriptor BackendDescriptorType = C.CUDNN_BACKEND_POINTWISE_DESCRIPTOR + BEDescriptorConvolutionDescriptor BackendDescriptorType = C.CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR + BEDescriptorEngineDescriptor BackendDescriptorType = C.CUDNN_BACKEND_ENGINE_DESCRIPTOR + BEDescriptorEnginecfgDescriptor BackendDescriptorType = C.CUDNN_BACKEND_ENGINECFG_DESCRIPTOR + BEDescriptorEngineheurDescriptor BackendDescriptorType = C.CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR + BEDescriptorExecutionPlanDescriptor BackendDescriptorType = C.CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR + BEDescriptorIntermediateInfoDescriptor BackendDescriptorType = C.CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR + BEDescriptorKnobChoiceDescriptor BackendDescriptorType = C.CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR + BEDescriptorKnobInfoDescriptor BackendDescriptorType = C.CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR + BEDescriptorLayoutInfoDescriptor BackendDescriptorType = C.CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR + BEDescriptorOperationConvolutionForwardDescriptor BackendDescriptorType = C.CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR + BEDescriptorOperationConvolutionBackwardFilterDescriptor BackendDescriptorType = C.CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR + BEDescriptorOperationConvolutionBackwardDataDescriptor BackendDescriptorType = C.CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR + BEDescriptorOperationPointwiseDescriptor BackendDescriptorType = C.CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR + BEDescriptorOperationGenStatsDescriptor BackendDescriptorType = C.CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR + BEDescriptorOperationgraphDescriptor BackendDescriptorType = C.CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR + BEDescriptorVariantPackDescriptor BackendDescriptorType = C.CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR + BEDescriptorTensorDescriptor BackendDescriptorType = C.CUDNN_BACKEND_TENSOR_DESCRIPTOR ) -// C returns the C representation of DataType -func (e DataType) C() C.cudnnDataType_t { return C.cudnnDataType_t(e) } +// C returns 
the C representation of BackendDescriptorType +func (e BackendDescriptorType) C() C.cudnnBackendDescriptorType_t { + return C.cudnnBackendDescriptorType_t(e) +} -//go:generate stringer -type=MathType +type BackendHeurMode int -type MathType int +const ( + Instant BackendHeurMode = C.CUDNN_HEUR_MODE_INSTANT + SCount BackendHeurMode = C.CUDNN_HEUR_MODES_COUNT +) + +// C returns the C representation of BackendHeurMode +func (e BackendHeurMode) C() C.cudnnBackendHeurMode_t { return C.cudnnBackendHeurMode_t(e) } + +type BackendKnobType int const ( - DefaultMath MathType = C.CUDNN_DEFAULT_MATH - TensorOpMath MathType = C.CUDNN_TENSOR_OP_MATH + SplitK BackendKnobType = C.CUDNN_KNOB_TYPE_SPLIT_K + Swizzle BackendKnobType = C.CUDNN_KNOB_TYPE_SWIZZLE + TileSize BackendKnobType = C.CUDNN_KNOB_TYPE_TILE_SIZE + UseTex BackendKnobType = C.CUDNN_KNOB_TYPE_USE_TEX + Edge BackendKnobType = C.CUDNN_KNOB_TYPE_EDGE + Kblock BackendKnobType = C.CUDNN_KNOB_TYPE_KBLOCK + Ldga BackendKnobType = C.CUDNN_KNOB_TYPE_LDGA + Ldgb BackendKnobType = C.CUDNN_KNOB_TYPE_LDGB + ChunkK BackendKnobType = C.CUDNN_KNOB_TYPE_CHUNK_K + SplitH BackendKnobType = C.CUDNN_KNOB_TYPE_SPLIT_H + WinoTile BackendKnobType = C.CUDNN_KNOB_TYPE_WINO_TILE + Multiply BackendKnobType = C.CUDNN_KNOB_TYPE_MULTIPLY + SplitKBuf BackendKnobType = C.CUDNN_KNOB_TYPE_SPLIT_K_BUF + Tilek BackendKnobType = C.CUDNN_KNOB_TYPE_TILEK + Stages BackendKnobType = C.CUDNN_KNOB_TYPE_STAGES + ReductionMode BackendKnobType = C.CUDNN_KNOB_TYPE_REDUCTION_MODE + CtaSplitKMode BackendKnobType = C.CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE + SplitKSlc BackendKnobType = C.CUDNN_KNOB_TYPE_SPLIT_K_SLC + IdxMode BackendKnobType = C.CUDNN_KNOB_TYPE_IDX_MODE + Sliced BackendKnobType = C.CUDNN_KNOB_TYPE_SLICED + SplitRs BackendKnobType = C.CUDNN_KNOB_TYPE_SPLIT_RS + Singlebuffer BackendKnobType = C.CUDNN_KNOB_TYPE_SINGLEBUFFER + Ldgc BackendKnobType = C.CUDNN_KNOB_TYPE_LDGC + Specfilt BackendKnobType = C.CUDNN_KNOB_TYPE_SPECFILT + Counts BackendKnobType = C.CUDNN_KNOB_TYPE_COUNTS ) -// C returns the C representation of MathType -func (e MathType) C() C.cudnnMathType_t { return C.cudnnMathType_t(e) } +// C returns the C representation of BackendKnobType +func (e BackendKnobType) C() C.cudnnBackendKnobType_t { return C.cudnnBackendKnobType_t(e) } -//go:generate stringer -type=NanPropagation +type BackendLayoutType int -type NanPropagation int +const ( + BELayoutPreferredNchw BackendLayoutType = C.CUDNN_LAYOUT_TYPE_PREFERRED_NCHW + BELayoutPreferredNhwc BackendLayoutType = C.CUDNN_LAYOUT_TYPE_PREFERRED_NHWC + BELayoutPreferredPad4ck BackendLayoutType = C.CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK + BELayoutPreferredPad8ck BackendLayoutType = C.CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK + BELayoutCount BackendLayoutType = C.CUDNN_LAYOUT_TYPE_COUNT +) + +// C returns the C representation of BackendLayoutType +func (e BackendLayoutType) C() C.cudnnBackendLayoutType_t { return C.cudnnBackendLayoutType_t(e) } + +type BackendNumericalNote int const ( - NotPropagateNan NanPropagation = C.CUDNN_NOT_PROPAGATE_NAN - PropagateNan NanPropagation = C.CUDNN_PROPAGATE_NAN + TensorCore BackendNumericalNote = C.CUDNN_NUMERICAL_NOTE_TENSOR_CORE + DownConvertInputs BackendNumericalNote = C.CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS + ReducedPrecisionReduction BackendNumericalNote = C.CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION + Fft BackendNumericalNote = C.CUDNN_NUMERICAL_NOTE_FFT + Nondeterministic BackendNumericalNote = C.CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC + Winograd BackendNumericalNote = 
C.CUDNN_NUMERICAL_NOTE_WINOGRAD + TypeCount BackendNumericalNote = C.CUDNN_NUMERICAL_NOTE_TYPE_COUNT ) -// C returns the C representation of NanPropagation -func (e NanPropagation) C() C.cudnnNanPropagation_t { return C.cudnnNanPropagation_t(e) } +// C returns the C representation of BackendNumericalNote +func (e BackendNumericalNote) C() C.cudnnBackendNumericalNote_t { + return C.cudnnBackendNumericalNote_t(e) +} -//go:generate stringer -type=Determinism +type BatchNormMode int + +const ( + PerActivation BatchNormMode = C.CUDNN_BATCHNORM_PER_ACTIVATION + Spatial BatchNormMode = C.CUDNN_BATCHNORM_SPATIAL + SpatialPersistent BatchNormMode = C.CUDNN_BATCHNORM_SPATIAL_PERSISTENT +) + +// C returns the C representation of BatchNormMode +func (e BatchNormMode) C() C.cudnnBatchNormMode_t { return C.cudnnBatchNormMode_t(e) } + +type BatchNormOps int + +const ( + BatchNorm BatchNormOps = C.CUDNN_BATCHNORM_OPS_BN + BatchNormActivation BatchNormOps = C.CUDNN_BATCHNORM_OPS_BN_ACTIVATION + BatchNormAddActivation BatchNormOps = C.CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION +) + +// C returns the C representation of BatchNormOps +func (e BatchNormOps) C() C.cudnnBatchNormOps_t { return C.cudnnBatchNormOps_t(e) } + +type CTCLossAlgo int + +const ( + DeterministicCTCLoss CTCLossAlgo = C.CUDNN_CTC_LOSS_ALGO_DETERMINISTIC + NonDeterministicCTCLoss CTCLossAlgo = C.CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC +) + +// C returns the C representation of CTCLossAlgo +func (e CTCLossAlgo) C() C.cudnnCTCLossAlgo_t { return C.cudnnCTCLossAlgo_t(e) } + +type DataType int + +const ( + Float DataType = C.CUDNN_DATA_FLOAT + Double DataType = C.CUDNN_DATA_DOUBLE + Half DataType = C.CUDNN_DATA_HALF + Int8 DataType = C.CUDNN_DATA_INT8 + Int32 DataType = C.CUDNN_DATA_INT32 + Int8x4 DataType = C.CUDNN_DATA_INT8x4 + Uint8 DataType = C.CUDNN_DATA_UINT8 + Uint8x4 DataType = C.CUDNN_DATA_UINT8x4 + Int8x32 DataType = C.CUDNN_DATA_INT8x32 +) + +// C returns the C representation of DataType +func (e DataType) C() C.cudnnDataType_t { return C.cudnnDataType_t(e) } type Determinism int @@ -70,67 +300,179 @@ const ( // C returns the C representation of Determinism func (e Determinism) C() C.cudnnDeterminism_t { return C.cudnnDeterminism_t(e) } -//go:generate stringer -type=TensorFormat +type DirectionMode int -type TensorFormat int +const ( + Unidirectional DirectionMode = C.CUDNN_UNIDIRECTIONAL + Bidirectional DirectionMode = C.CUDNN_BIDIRECTIONAL +) + +// C returns the C representation of DirectionMode +func (e DirectionMode) C() C.cudnnDirectionMode_t { return C.cudnnDirectionMode_t(e) } + +type DivNormMode int const ( - NCHW TensorFormat = C.CUDNN_TENSOR_NCHW - NHWC TensorFormat = C.CUDNN_TENSOR_NHWC - NCHWVectC TensorFormat = C.CUDNN_TENSOR_NCHW_VECT_C + PrecomputedMeans DivNormMode = C.CUDNN_DIVNORM_PRECOMPUTED_MEANS ) -// C returns the C representation of TensorFormat -func (e TensorFormat) C() C.cudnnTensorFormat_t { return C.cudnnTensorFormat_t(e) } +// C returns the C representation of DivNormMode +func (e DivNormMode) C() C.cudnnDivNormMode_t { return C.cudnnDivNormMode_t(e) } -//go:generate stringer -type=OpTensorOp +type ErrQueryMode int -type OpTensorOp int +const ( + Rawcode ErrQueryMode = C.CUDNN_ERRQUERY_RAWCODE + Nonblocking ErrQueryMode = C.CUDNN_ERRQUERY_NONBLOCKING + Blocking ErrQueryMode = C.CUDNN_ERRQUERY_BLOCKING +) + +// C returns the C representation of ErrQueryMode +func (e ErrQueryMode) C() C.cudnnErrQueryMode_t { return C.cudnnErrQueryMode_t(e) } + +type FoldingDirection int const ( - Add OpTensorOp = 
C.CUDNN_OP_TENSOR_ADD - Mul OpTensorOp = C.CUDNN_OP_TENSOR_MUL - Min OpTensorOp = C.CUDNN_OP_TENSOR_MIN - Max OpTensorOp = C.CUDNN_OP_TENSOR_MAX - Sqrt OpTensorOp = C.CUDNN_OP_TENSOR_SQRT - Not OpTensorOp = C.CUDNN_OP_TENSOR_NOT + Fold FoldingDirection = C.CUDNN_TRANSFORM_FOLD + Unfold FoldingDirection = C.CUDNN_TRANSFORM_UNFOLD ) -// C returns the C representation of OpTensorOp -func (e OpTensorOp) C() C.cudnnOpTensorOp_t { return C.cudnnOpTensorOp_t(e) } +// C returns the C representation of FoldingDirection +func (e FoldingDirection) C() C.cudnnFoldingDirection_t { return C.cudnnFoldingDirection_t(e) } -//go:generate stringer -type=ReduceTensorOp +type ForwardMode int -type ReduceTensorOp int +const ( + Inference ForwardMode = C.CUDNN_FWD_MODE_INFERENCE + Training ForwardMode = C.CUDNN_FWD_MODE_TRAINING +) + +// C returns the C representation of ForwardMode +func (e ForwardMode) C() C.cudnnForwardMode_t { return C.cudnnForwardMode_t(e) } + +type FusedOpsConstParamLabel int const ( - ReduceAdd ReduceTensorOp = C.CUDNN_REDUCE_TENSOR_ADD - ReduceMul ReduceTensorOp = C.CUDNN_REDUCE_TENSOR_MUL - ReduceMin ReduceTensorOp = C.CUDNN_REDUCE_TENSOR_MIN - ReduceMax ReduceTensorOp = C.CUDNN_REDUCE_TENSOR_MAX - ReduceAmax ReduceTensorOp = C.CUDNN_REDUCE_TENSOR_AMAX - ReduceAvg ReduceTensorOp = C.CUDNN_REDUCE_TENSOR_AVG - ReduceNorm1 ReduceTensorOp = C.CUDNN_REDUCE_TENSOR_NORM1 - ReduceNorm2 ReduceTensorOp = C.CUDNN_REDUCE_TENSOR_NORM2 - ReduceMulNoZeros ReduceTensorOp = C.CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS + Xdesc FusedOpsConstParamLabel = C.CUDNN_PARAM_XDESC + XdataPlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_XDATA_PLACEHOLDER + BnMode FusedOpsConstParamLabel = C.CUDNN_PARAM_BN_MODE + BnEqscalebiasDesc FusedOpsConstParamLabel = C.CUDNN_PARAM_BN_EQSCALEBIAS_DESC + BnEqscalePlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER + BnEqbiasPlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER + ActivationDesc FusedOpsConstParamLabel = C.CUDNN_PARAM_ACTIVATION_DESC + ConvDesc FusedOpsConstParamLabel = C.CUDNN_PARAM_CONV_DESC + Wdesc FusedOpsConstParamLabel = C.CUDNN_PARAM_WDESC + WdataPlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_WDATA_PLACEHOLDER + Dwdesc FusedOpsConstParamLabel = C.CUDNN_PARAM_DWDESC + DwdataPlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_DWDATA_PLACEHOLDER + Ydesc FusedOpsConstParamLabel = C.CUDNN_PARAM_YDESC + YdataPlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_YDATA_PLACEHOLDER + Dydesc FusedOpsConstParamLabel = C.CUDNN_PARAM_DYDESC + DydataPlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_DYDATA_PLACEHOLDER + YstatsDesc FusedOpsConstParamLabel = C.CUDNN_PARAM_YSTATS_DESC + YsumPlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_YSUM_PLACEHOLDER + YsqsumPlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_YSQSUM_PLACEHOLDER + BnScalebiasMeanvarDesc FusedOpsConstParamLabel = C.CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC + BnScalePlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_BN_SCALE_PLACEHOLDER + BnBiasPlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_BN_BIAS_PLACEHOLDER + BnSavedMeanPlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER + BnSavedInvstdPlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER + BnRunningMeanPlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER + BnRunningVarPlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER + Zdesc FusedOpsConstParamLabel = C.CUDNN_PARAM_ZDESC + ZdataPlaceholder 
FusedOpsConstParamLabel = C.CUDNN_PARAM_ZDATA_PLACEHOLDER + BnZEqscalebiasDesc FusedOpsConstParamLabel = C.CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC + BnZEqscalePlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER + BnZEqbiasPlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER + ActivationBitmaskDesc FusedOpsConstParamLabel = C.CUDNN_PARAM_ACTIVATION_BITMASK_DESC + ActivationBitmaskPlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER + Dxdesc FusedOpsConstParamLabel = C.CUDNN_PARAM_DXDESC + DxdataPlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_DXDATA_PLACEHOLDER + Dzdesc FusedOpsConstParamLabel = C.CUDNN_PARAM_DZDESC + DzdataPlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_DZDATA_PLACEHOLDER + BnDscalePlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_BN_DSCALE_PLACEHOLDER + BnDbiasPlaceholder FusedOpsConstParamLabel = C.CUDNN_PARAM_BN_DBIAS_PLACEHOLDER ) -// C returns the C representation of ReduceTensorOp -func (e ReduceTensorOp) C() C.cudnnReduceTensorOp_t { return C.cudnnReduceTensorOp_t(e) } +// C returns the C representation of FusedOpsConstParamLabel +func (e FusedOpsConstParamLabel) C() C.cudnnFusedOpsConstParamLabel_t { + return C.cudnnFusedOpsConstParamLabel_t(e) +} -//go:generate stringer -type=ReduceTensorIndices +type FusedOpsPointerPlaceHolder int -type ReduceTensorIndices int +const ( + NullPtr FusedOpsPointerPlaceHolder = C.CUDNN_PTR_NULL + PtrElemAligned FusedOpsPointerPlaceHolder = C.CUDNN_PTR_ELEM_ALIGNED + Ptr16 FusedOpsPointerPlaceHolder = C.CUDNN_PTR_16B_ALIGNED +) + +// C returns the C representation of FusedOpsPointerPlaceHolder +func (e FusedOpsPointerPlaceHolder) C() C.cudnnFusedOpsPointerPlaceHolder_t { + return C.cudnnFusedOpsPointerPlaceHolder_t(e) +} + +type FusedOpsVariantParamLabel int const ( - ReduceNoIndices ReduceTensorIndices = C.CUDNN_REDUCE_TENSOR_NO_INDICES - ReduceFlattenedIndices ReduceTensorIndices = C.CUDNN_REDUCE_TENSOR_FLATTENED_INDICES + PtrXdata FusedOpsVariantParamLabel = C.CUDNN_PTR_XDATA + PtrBnEqscale FusedOpsVariantParamLabel = C.CUDNN_PTR_BN_EQSCALE + PtrBnEqbias FusedOpsVariantParamLabel = C.CUDNN_PTR_BN_EQBIAS + PtrWdata FusedOpsVariantParamLabel = C.CUDNN_PTR_WDATA + PtrDwdata FusedOpsVariantParamLabel = C.CUDNN_PTR_DWDATA + PtrYdata FusedOpsVariantParamLabel = C.CUDNN_PTR_YDATA + PtrDydata FusedOpsVariantParamLabel = C.CUDNN_PTR_DYDATA + PtrYsum FusedOpsVariantParamLabel = C.CUDNN_PTR_YSUM + PtrYsqsum FusedOpsVariantParamLabel = C.CUDNN_PTR_YSQSUM + PtrWorkspace FusedOpsVariantParamLabel = C.CUDNN_PTR_WORKSPACE + PtrBnScale FusedOpsVariantParamLabel = C.CUDNN_PTR_BN_SCALE + PtrBnBias FusedOpsVariantParamLabel = C.CUDNN_PTR_BN_BIAS + PtrBnSavedMean FusedOpsVariantParamLabel = C.CUDNN_PTR_BN_SAVED_MEAN + PtrBnSavedInvstd FusedOpsVariantParamLabel = C.CUDNN_PTR_BN_SAVED_INVSTD + PtrBnRunningMean FusedOpsVariantParamLabel = C.CUDNN_PTR_BN_RUNNING_MEAN + PtrBnRunningVar FusedOpsVariantParamLabel = C.CUDNN_PTR_BN_RUNNING_VAR + PtrZdata FusedOpsVariantParamLabel = C.CUDNN_PTR_ZDATA + PtrBnZEqscale FusedOpsVariantParamLabel = C.CUDNN_PTR_BN_Z_EQSCALE + PtrBnZEqbias FusedOpsVariantParamLabel = C.CUDNN_PTR_BN_Z_EQBIAS + PtrActivationBitmask FusedOpsVariantParamLabel = C.CUDNN_PTR_ACTIVATION_BITMASK + PtrDxdata FusedOpsVariantParamLabel = C.CUDNN_PTR_DXDATA + PtrDzdata FusedOpsVariantParamLabel = C.CUDNN_PTR_DZDATA + PtrBnDscale FusedOpsVariantParamLabel = C.CUDNN_PTR_BN_DSCALE + PtrBnDbias FusedOpsVariantParamLabel = C.CUDNN_PTR_BN_DBIAS + 
ScalarSizeTWorkspaceSizeInBytes FusedOpsVariantParamLabel = C.CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES + ScalarInt64TBnAccumulationCount FusedOpsVariantParamLabel = C.CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT + ScalarDoubleBnExpAvgFactor FusedOpsVariantParamLabel = C.CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR + ScalarDoubleBnEpsilon FusedOpsVariantParamLabel = C.CUDNN_SCALAR_DOUBLE_BN_EPSILON ) -// C returns the C representation of ReduceTensorIndices -func (e ReduceTensorIndices) C() C.cudnnReduceTensorIndices_t { return C.cudnnReduceTensorIndices_t(e) } +// C returns the C representation of FusedOpsVariantParamLabel +func (e FusedOpsVariantParamLabel) C() C.cudnnFusedOpsVariantParamLabel_t { + return C.cudnnFusedOpsVariantParamLabel_t(e) +} + +type FusedOps int + +const ( + ScaleBiasActivationConvBnstats FusedOps = C.CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS + ScaleBiasActivationWgrad FusedOps = C.CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD + BnFinalizeStatisticsTraining FusedOps = C.CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING + BnFinalizeStatisticsInference FusedOps = C.CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE + ConvScaleBiasAddActivation FusedOps = C.CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION + ScaleBiasAddActivationGenBitmask FusedOps = C.CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK + DactivationForkDbatchnorm FusedOps = C.CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM +) + +// C returns the C representation of FusedOps +func (e FusedOps) C() C.cudnnFusedOps_t { return C.cudnnFusedOps_t(e) } + +type GenStatsMode int + +const ( + SumSq GenStatsMode = C.CUDNN_GENSTATS_SUM_SQSUM +) -//go:generate stringer -type=IndicesType +// C returns the C representation of GenStatsMode +func (e GenStatsMode) C() C.cudnnGenStatsMode_t { return C.cudnnGenStatsMode_t(e) } type IndicesType int @@ -144,32 +486,128 @@ const ( // C returns the C representation of IndicesType func (e IndicesType) C() C.cudnnIndicesType_t { return C.cudnnIndicesType_t(e) } -//go:generate stringer -type=SoftmaxAlgorithm +type LRNMode int -type SoftmaxAlgorithm int +const ( + CrossChannelDim1 LRNMode = C.CUDNN_LRN_CROSS_CHANNEL_DIM1 +) + +// C returns the C representation of LRNMode +func (e LRNMode) C() C.cudnnLRNMode_t { return C.cudnnLRNMode_t(e) } + +type LossNormalizationMode int const ( - Fast SoftmaxAlgorithm = C.CUDNN_SOFTMAX_FAST - Accurate SoftmaxAlgorithm = C.CUDNN_SOFTMAX_ACCURATE - Log SoftmaxAlgorithm = C.CUDNN_SOFTMAX_LOG + LossNormNone LossNormalizationMode = C.CUDNN_LOSS_NORMALIZATION_NONE + LossNormSoftmax LossNormalizationMode = C.CUDNN_LOSS_NORMALIZATION_SOFTMAX ) -// C returns the C representation of SoftmaxAlgorithm -func (e SoftmaxAlgorithm) C() C.cudnnSoftmaxAlgorithm_t { return C.cudnnSoftmaxAlgorithm_t(e) } +// C returns the C representation of LossNormalizationMode +func (e LossNormalizationMode) C() C.cudnnLossNormalizationMode_t { + return C.cudnnLossNormalizationMode_t(e) +} -//go:generate stringer -type=SoftmaxMode +type MathType int -type SoftmaxMode int +const ( + DefaultMath MathType = C.CUDNN_DEFAULT_MATH + TensorOpMath MathType = C.CUDNN_TENSOR_OP_MATH + TensorOpMathAllowConversion MathType = C.CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION + FmaMath MathType = C.CUDNN_FMA_MATH +) + +// C returns the C representation of MathType +func (e MathType) C() C.cudnnMathType_t { return C.cudnnMathType_t(e) } + +type MultiHeadAttnWeightKind int const ( - Instance SoftmaxMode = C.CUDNN_SOFTMAX_MODE_INSTANCE - Channel SoftmaxMode = C.CUDNN_SOFTMAX_MODE_CHANNEL + QWeights MultiHeadAttnWeightKind = 
C.CUDNN_MH_ATTN_Q_WEIGHTS + KWeights MultiHeadAttnWeightKind = C.CUDNN_MH_ATTN_K_WEIGHTS + VWeights MultiHeadAttnWeightKind = C.CUDNN_MH_ATTN_V_WEIGHTS + OWeights MultiHeadAttnWeightKind = C.CUDNN_MH_ATTN_O_WEIGHTS + QBiases MultiHeadAttnWeightKind = C.CUDNN_MH_ATTN_Q_BIASES + KBiases MultiHeadAttnWeightKind = C.CUDNN_MH_ATTN_K_BIASES + VBiases MultiHeadAttnWeightKind = C.CUDNN_MH_ATTN_V_BIASES + OBiases MultiHeadAttnWeightKind = C.CUDNN_MH_ATTN_O_BIASES ) -// C returns the C representation of SoftmaxMode -func (e SoftmaxMode) C() C.cudnnSoftmaxMode_t { return C.cudnnSoftmaxMode_t(e) } +// C returns the C representation of MultiHeadAttnWeightKind +func (e MultiHeadAttnWeightKind) C() C.cudnnMultiHeadAttnWeightKind_t { + return C.cudnnMultiHeadAttnWeightKind_t(e) +} -//go:generate stringer -type=PoolingMode +type NanPropagation int + +const ( + NotPropagateNan NanPropagation = C.CUDNN_NOT_PROPAGATE_NAN + PropagateNan NanPropagation = C.CUDNN_PROPAGATE_NAN +) + +// C returns the C representation of NanPropagation +func (e NanPropagation) C() C.cudnnNanPropagation_t { return C.cudnnNanPropagation_t(e) } + +type NormAlgo int + +const ( + StandardNorm NormAlgo = C.CUDNN_NORM_ALGO_STANDARD + PersistNorm NormAlgo = C.CUDNN_NORM_ALGO_PERSIST +) + +// C returns the C representation of NormAlgo +func (e NormAlgo) C() C.cudnnNormAlgo_t { return C.cudnnNormAlgo_t(e) } + +type NormMode int + +const ( + NormPerActivation NormMode = C.CUDNN_NORM_PER_ACTIVATION + NormPerChannel NormMode = C.CUDNN_NORM_PER_CHANNEL +) + +// C returns the C representation of NormMode +func (e NormMode) C() C.cudnnNormMode_t { return C.cudnnNormMode_t(e) } + +type NormOps int + +const ( + Norm NormOps = C.CUDNN_NORM_OPS_NORM + NormActivation NormOps = C.CUDNN_NORM_OPS_NORM_ACTIVATION + NormAddActivation NormOps = C.CUDNN_NORM_OPS_NORM_ADD_ACTIVATION +) + +// C returns the C representation of NormOps +func (e NormOps) C() C.cudnnNormOps_t { return C.cudnnNormOps_t(e) } + +type OpTensorOp int + +const ( + TensorAdd OpTensorOp = C.CUDNN_OP_TENSOR_ADD + TensorMul OpTensorOp = C.CUDNN_OP_TENSOR_MUL + TensorMin OpTensorOp = C.CUDNN_OP_TENSOR_MIN + TensorMax OpTensorOp = C.CUDNN_OP_TENSOR_MAX + TensorSqrt OpTensorOp = C.CUDNN_OP_TENSOR_SQRT + TensorNot OpTensorOp = C.CUDNN_OP_TENSOR_NOT +) + +// C returns the C representation of OpTensorOp +func (e OpTensorOp) C() C.cudnnOpTensorOp_t { return C.cudnnOpTensorOp_t(e) } + +type PointwiseMode int + +const ( + PointwiseAdd PointwiseMode = C.CUDNN_POINTWISE_ADD + PointwiseMul PointwiseMode = C.CUDNN_POINTWISE_MUL + PointwiseMin PointwiseMode = C.CUDNN_POINTWISE_MIN + PointwiseMax PointwiseMode = C.CUDNN_POINTWISE_MAX + PointwiseSqrt PointwiseMode = C.CUDNN_POINTWISE_SQRT + PointwiseReluFwd PointwiseMode = C.CUDNN_POINTWISE_RELU_FWD + PointwiseTanhFwd PointwiseMode = C.CUDNN_POINTWISE_TANH_FWD + PointwiseSigmoidFwd PointwiseMode = C.CUDNN_POINTWISE_SIGMOID_FWD + PointwiseEluFwd PointwiseMode = C.CUDNN_POINTWISE_ELU_FWD +) + +// C returns the C representation of PointwiseMode +func (e PointwiseMode) C() C.cudnnPointwiseMode_t { return C.cudnnPointwiseMode_t(e) } type PoolingMode int @@ -183,68 +621,60 @@ const ( // C returns the C representation of PoolingMode func (e PoolingMode) C() C.cudnnPoolingMode_t { return C.cudnnPoolingMode_t(e) } -//go:generate stringer -type=ActivationMode - -type ActivationMode int +type RNNAlgo int const ( - Sigmoid ActivationMode = C.CUDNN_ACTIVATION_SIGMOID - ReLU ActivationMode = C.CUDNN_ACTIVATION_RELU - Tanh ActivationMode = C.CUDNN_ACTIVATION_TANH - 
ClippedReLU ActivationMode = C.CUDNN_ACTIVATION_CLIPPED_RELU - Elu ActivationMode = C.CUDNN_ACTIVATION_ELU + StandardRNN RNNAlgo = C.CUDNN_RNN_ALGO_STANDARD + PersistStaticRNN RNNAlgo = C.CUDNN_RNN_ALGO_PERSIST_STATIC + PersistDynamicRNN RNNAlgo = C.CUDNN_RNN_ALGO_PERSIST_DYNAMIC + CountRNN RNNAlgo = C.CUDNN_RNN_ALGO_COUNT ) -// C returns the C representation of ActivationMode -func (e ActivationMode) C() C.cudnnActivationMode_t { return C.cudnnActivationMode_t(e) } - -//go:generate stringer -type=LRNMode +// C returns the C representation of RNNAlgo +func (e RNNAlgo) C() C.cudnnRNNAlgo_t { return C.cudnnRNNAlgo_t(e) } -type LRNMode int +type RNNBiasMode int const ( - CrossChannelDim1 LRNMode = C.CUDNN_LRN_CROSS_CHANNEL_DIM1 + RNNNoBias RNNBiasMode = C.CUDNN_RNN_NO_BIAS + RNNSingleInpBias RNNBiasMode = C.CUDNN_RNN_SINGLE_INP_BIAS + RNNDoubleBias RNNBiasMode = C.CUDNN_RNN_DOUBLE_BIAS + RNNSingleRecBias RNNBiasMode = C.CUDNN_RNN_SINGLE_REC_BIAS ) -// C returns the C representation of LRNMode -func (e LRNMode) C() C.cudnnLRNMode_t { return C.cudnnLRNMode_t(e) } - -//go:generate stringer -type=DivNormMode +// C returns the C representation of RNNBiasMode +func (e RNNBiasMode) C() C.cudnnRNNBiasMode_t { return C.cudnnRNNBiasMode_t(e) } -type DivNormMode int +type RNNClipMode int const ( - PrecomputedMeans DivNormMode = C.CUDNN_DIVNORM_PRECOMPUTED_MEANS + RNNClipNone RNNClipMode = C.CUDNN_RNN_CLIP_NONE + RNNClipMinmax RNNClipMode = C.CUDNN_RNN_CLIP_MINMAX ) -// C returns the C representation of DivNormMode -func (e DivNormMode) C() C.cudnnDivNormMode_t { return C.cudnnDivNormMode_t(e) } - -//go:generate stringer -type=BatchNormMode +// C returns the C representation of RNNClipMode +func (e RNNClipMode) C() C.cudnnRNNClipMode_t { return C.cudnnRNNClipMode_t(e) } -type BatchNormMode int +type RNNDataLayout int const ( - PerActivation BatchNormMode = C.CUDNN_BATCHNORM_PER_ACTIVATION - Spatial BatchNormMode = C.CUDNN_BATCHNORM_SPATIAL - SpatialPersistent BatchNormMode = C.CUDNN_BATCHNORM_SPATIAL_PERSISTENT + SeqMajorUnpacked RNNDataLayout = C.CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED + SeqMajorPacked RNNDataLayout = C.CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED + BatchMajorUnpacked RNNDataLayout = C.CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED ) -// C returns the C representation of BatchNormMode -func (e BatchNormMode) C() C.cudnnBatchNormMode_t { return C.cudnnBatchNormMode_t(e) } +// C returns the C representation of RNNDataLayout +func (e RNNDataLayout) C() C.cudnnRNNDataLayout_t { return C.cudnnRNNDataLayout_t(e) } -//go:generate stringer -type=SamplerType - -type SamplerType int +type RNNInputMode int const ( - Bilinear SamplerType = C.CUDNN_SAMPLER_BILINEAR + LinearInput RNNInputMode = C.CUDNN_LINEAR_INPUT + SkipInput RNNInputMode = C.CUDNN_SKIP_INPUT ) -// C returns the C representation of SamplerType -func (e SamplerType) C() C.cudnnSamplerType_t { return C.cudnnSamplerType_t(e) } - -//go:generate stringer -type=RNNMode +// C returns the C representation of RNNInputMode +func (e RNNInputMode) C() C.cudnnRNNInputMode_t { return C.cudnnRNNInputMode_t(e) } type RNNMode int @@ -258,51 +688,114 @@ const ( // C returns the C representation of RNNMode func (e RNNMode) C() C.cudnnRNNMode_t { return C.cudnnRNNMode_t(e) } -//go:generate stringer -type=DirectionMode +type ReduceTensorIndices int + +const ( + ReduceNoIndices ReduceTensorIndices = C.CUDNN_REDUCE_TENSOR_NO_INDICES + ReduceFlattenedIndices ReduceTensorIndices = C.CUDNN_REDUCE_TENSOR_FLATTENED_INDICES +) -type DirectionMode int +// C returns the C 
representation of ReduceTensorIndices +func (e ReduceTensorIndices) C() C.cudnnReduceTensorIndices_t { return C.cudnnReduceTensorIndices_t(e) } + +type ReduceTensorOp int const ( - Unidirectional DirectionMode = C.CUDNN_UNIDIRECTIONAL - Bidirectional DirectionMode = C.CUDNN_BIDIRECTIONAL + ReduceAdd ReduceTensorOp = C.CUDNN_REDUCE_TENSOR_ADD + ReduceMul ReduceTensorOp = C.CUDNN_REDUCE_TENSOR_MUL + ReduceMin ReduceTensorOp = C.CUDNN_REDUCE_TENSOR_MIN + ReduceMax ReduceTensorOp = C.CUDNN_REDUCE_TENSOR_MAX + ReduceAmax ReduceTensorOp = C.CUDNN_REDUCE_TENSOR_AMAX + ReduceAvg ReduceTensorOp = C.CUDNN_REDUCE_TENSOR_AVG + ReduceNorm1 ReduceTensorOp = C.CUDNN_REDUCE_TENSOR_NORM1 + ReduceNorm2 ReduceTensorOp = C.CUDNN_REDUCE_TENSOR_NORM2 + ReduceMulNoZeros ReduceTensorOp = C.CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS ) -// C returns the C representation of DirectionMode -func (e DirectionMode) C() C.cudnnDirectionMode_t { return C.cudnnDirectionMode_t(e) } +// C returns the C representation of ReduceTensorOp +func (e ReduceTensorOp) C() C.cudnnReduceTensorOp_t { return C.cudnnReduceTensorOp_t(e) } -//go:generate stringer -type=RNNInputMode +type ReorderType int -type RNNInputMode int +const ( + DefaultReorder ReorderType = C.CUDNN_DEFAULT_REORDER + NoReorder ReorderType = C.CUDNN_NO_REORDER +) + +// C returns the C representation of ReorderType +func (e ReorderType) C() C.cudnnReorderType_t { return C.cudnnReorderType_t(e) } + +type SamplerType int const ( - LinearInput RNNInputMode = C.CUDNN_LINEAR_INPUT - SkipInput RNNInputMode = C.CUDNN_SKIP_INPUT + Bilinear SamplerType = C.CUDNN_SAMPLER_BILINEAR ) -// C returns the C representation of RNNInputMode -func (e RNNInputMode) C() C.cudnnRNNInputMode_t { return C.cudnnRNNInputMode_t(e) } +// C returns the C representation of SamplerType +func (e SamplerType) C() C.cudnnSamplerType_t { return C.cudnnSamplerType_t(e) } -//go:generate stringer -type=RNNAlgo +type SeqDataAxis int -type RNNAlgo int +const ( + TimeDim SeqDataAxis = C.CUDNN_SEQDATA_TIME_DIM + BatchDim SeqDataAxis = C.CUDNN_SEQDATA_BATCH_DIM + BeamDim SeqDataAxis = C.CUDNN_SEQDATA_BEAM_DIM + VectDim SeqDataAxis = C.CUDNN_SEQDATA_VECT_DIM +) + +// C returns the C representation of SeqDataAxis +func (e SeqDataAxis) C() C.cudnnSeqDataAxis_t { return C.cudnnSeqDataAxis_t(e) } + +type Severity int const ( - Standard RNNAlgo = C.CUDNN_RNN_ALGO_STANDARD - PersistStatic RNNAlgo = C.CUDNN_RNN_ALGO_PERSIST_STATIC - PersistDynamic RNNAlgo = C.CUDNN_RNN_ALGO_PERSIST_DYNAMIC + Fatal Severity = C.CUDNN_SEV_FATAL + Error Severity = C.CUDNN_SEV_ERROR + Warning Severity = C.CUDNN_SEV_WARNING + Info Severity = C.CUDNN_SEV_INFO ) -// C returns the C representation of RNNAlgo -func (e RNNAlgo) C() C.cudnnRNNAlgo_t { return C.cudnnRNNAlgo_t(e) } +// C returns the C representation of Severity +func (e Severity) C() C.cudnnSeverity_t { return C.cudnnSeverity_t(e) } -//go:generate stringer -type=CTCLossAlgo +type SoftmaxAlgorithm int -type CTCLossAlgo int +const ( + Fast SoftmaxAlgorithm = C.CUDNN_SOFTMAX_FAST + Accurate SoftmaxAlgorithm = C.CUDNN_SOFTMAX_ACCURATE + Log SoftmaxAlgorithm = C.CUDNN_SOFTMAX_LOG +) + +// C returns the C representation of SoftmaxAlgorithm +func (e SoftmaxAlgorithm) C() C.cudnnSoftmaxAlgorithm_t { return C.cudnnSoftmaxAlgorithm_t(e) } + +type SoftmaxMode int const ( - DeterministicCTCLoss CTCLossAlgo = C.CUDNN_CTC_LOSS_ALGO_DETERMINISTIC - NonDeterministicCTCLoss CTCLossAlgo = C.CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC + Instance SoftmaxMode = C.CUDNN_SOFTMAX_MODE_INSTANCE + Channel SoftmaxMode = 
C.CUDNN_SOFTMAX_MODE_CHANNEL ) -// C returns the C representation of CTCLossAlgo -func (e CTCLossAlgo) C() C.cudnnCTCLossAlgo_t { return C.cudnnCTCLossAlgo_t(e) } +// C returns the C representation of SoftmaxMode +func (e SoftmaxMode) C() C.cudnnSoftmaxMode_t { return C.cudnnSoftmaxMode_t(e) } + +type TensorFormat int + +const ( + NCHW TensorFormat = C.CUDNN_TENSOR_NCHW + NHWC TensorFormat = C.CUDNN_TENSOR_NHWC + NCHWVectC TensorFormat = C.CUDNN_TENSOR_NCHW_VECT_C +) + +// C returns the C representation of TensorFormat +func (e TensorFormat) C() C.cudnnTensorFormat_t { return C.cudnnTensorFormat_t(e) } + +type WgradMode int + +const ( + Add WgradMode = C.CUDNN_WGRAD_MODE_ADD + Set WgradMode = C.CUDNN_WGRAD_MODE_SET +) + +// C returns the C representation of WgradMode +func (e WgradMode) C() C.cudnnWgradMode_t { return C.cudnnWgradMode_t(e) } diff --git a/dnn/generated_enums_strings.go b/dnn/generated_enums_strings.go index 3f4f3f0..80099a8 100644 --- a/dnn/generated_enums_strings.go +++ b/dnn/generated_enums_strings.go @@ -5,38 +5,246 @@ package cudnn // #include import "C" -var _ErrQueryModeNames = map[ErrQueryMode]string{ - Rawcode: "Rawcode", - Nonblocking: "Nonblocking", - Blocking: "Blocking", +var _ActivationModeNames = map[ActivationMode]string{ + Sigmoid: "Sigmoid", + ReLU: "ReLU", + Tanh: "Tanh", + ClippedReLU: "ClippedReLU", + Elu: "Elu", + Identity: "Identity", } -func (e ErrQueryMode) String() string { return _ErrQueryModeNames[e] } +func (e ActivationMode) String() string { return _ActivationModeNames[e] } -var _DataTypeNames = map[DataType]string{ - Float: "Float", - Double: "Double", - Half: "Half", - Int8: "Int8", - Int32: "Int32", - Int8x4: "Int8x4", +var _BackendAttributeNameNames = map[BackendAttributeName]string{ + BEAttrNamePointwiseMode: "BEAttrNamePointwiseMode", + BEAttrNamePointwiseMathPrec: "BEAttrNamePointwiseMathPrec", + BEAttrNamePointwiseNanPropagation: "BEAttrNamePointwiseNanPropagation", + BEAttrNamePointwiseReluLowerClip: "BEAttrNamePointwiseReluLowerClip", + BEAttrNamePointwiseReluUpperClip: "BEAttrNamePointwiseReluUpperClip", + BEAttrNameConvolutionCompType: "BEAttrNameConvolutionCompType", + BEAttrNameConvolutionConvMode: "BEAttrNameConvolutionConvMode", + BEAttrNameConvolutionDilations: "BEAttrNameConvolutionDilations", + BEAttrNameConvolutionFilterStrides: "BEAttrNameConvolutionFilterStrides", + BEAttrNameConvolutionPostPaddings: "BEAttrNameConvolutionPostPaddings", + BEAttrNameConvolutionPrePaddings: "BEAttrNameConvolutionPrePaddings", + BEAttrNameConvolutionSpatialDims: "BEAttrNameConvolutionSpatialDims", + BEAttrNameEngineheurMode: "BEAttrNameEngineheurMode", + BEAttrNameEngineheurOperationGraph: "BEAttrNameEngineheurOperationGraph", + BEAttrNameEngineheurResults: "BEAttrNameEngineheurResults", + BEAttrNameEnginecfgEngine: "BEAttrNameEnginecfgEngine", + BEAttrNameEnginecfgIntermediateInfo: "BEAttrNameEnginecfgIntermediateInfo", + BEAttrNameEnginecfgKnobChoices: "BEAttrNameEnginecfgKnobChoices", + BEAttrNameExecutionPlanHandle: "BEAttrNameExecutionPlanHandle", + BEAttrNameExecutionPlanEngineConfig: "BEAttrNameExecutionPlanEngineConfig", + BEAttrNameExecutionPlanWorkspaceSize: "BEAttrNameExecutionPlanWorkspaceSize", + BEAttrNameExecutionPlanComputedIntermediateUids: "BEAttrNameExecutionPlanComputedIntermediateUids", + BEAttrNameExecutionPlanRunOnlyIntermediateUids: "BEAttrNameExecutionPlanRunOnlyIntermediateUids", + BEAttrNameIntermediateInfoUniqueId: "BEAttrNameIntermediateInfoUniqueId", + BEAttrNameIntermediateInfoSize: 
"BEAttrNameIntermediateInfoSize", + BEAttrNameIntermediateInfoDependentDataUids: "BEAttrNameIntermediateInfoDependentDataUids", + BEAttrNameIntermediateInfoDependentAttributes: "BEAttrNameIntermediateInfoDependentAttributes", + BEAttrNameKnobChoiceKnobType: "BEAttrNameKnobChoiceKnobType", + BEAttrNameKnobChoiceKnobValue: "BEAttrNameKnobChoiceKnobValue", + BEAttrNameOperationConvolutionForwardAlpha: "BEAttrNameOperationConvolutionForwardAlpha", + BEAttrNameOperationConvolutionForwardBeta: "BEAttrNameOperationConvolutionForwardBeta", + BEAttrNameOperationConvolutionForwardConvDesc: "BEAttrNameOperationConvolutionForwardConvDesc", + BEAttrNameOperationConvolutionForwardW: "BEAttrNameOperationConvolutionForwardW", + BEAttrNameOperationConvolutionForwardX: "BEAttrNameOperationConvolutionForwardX", + BEAttrNameOperationConvolutionForwardY: "BEAttrNameOperationConvolutionForwardY", + BEAttrNameOperationConvolutionBwdDataAlpha: "BEAttrNameOperationConvolutionBwdDataAlpha", + BEAttrNameOperationConvolutionBwdDataBeta: "BEAttrNameOperationConvolutionBwdDataBeta", + BEAttrNameOperationConvolutionBwdDataConvDesc: "BEAttrNameOperationConvolutionBwdDataConvDesc", + BEAttrNameOperationConvolutionBwdDataW: "BEAttrNameOperationConvolutionBwdDataW", + BEAttrNameOperationConvolutionBwdDataDx: "BEAttrNameOperationConvolutionBwdDataDx", + BEAttrNameOperationConvolutionBwdDataDy: "BEAttrNameOperationConvolutionBwdDataDy", + BEAttrNameOperationConvolutionBwdFilterAlpha: "BEAttrNameOperationConvolutionBwdFilterAlpha", + BEAttrNameOperationConvolutionBwdFilterBeta: "BEAttrNameOperationConvolutionBwdFilterBeta", + BEAttrNameOperationConvolutionBwdFilterConvDesc: "BEAttrNameOperationConvolutionBwdFilterConvDesc", + BEAttrNameOperationConvolutionBwdFilterDw: "BEAttrNameOperationConvolutionBwdFilterDw", + BEAttrNameOperationConvolutionBwdFilterX: "BEAttrNameOperationConvolutionBwdFilterX", + BEAttrNameOperationConvolutionBwdFilterDy: "BEAttrNameOperationConvolutionBwdFilterDy", + BEAttrNameOperationPointwisePwDescriptor: "BEAttrNameOperationPointwisePwDescriptor", + BEAttrNameOperationPointwiseXdesc: "BEAttrNameOperationPointwiseXdesc", + BEAttrNameOperationPointwiseBdesc: "BEAttrNameOperationPointwiseBdesc", + BEAttrNameOperationPointwiseYdesc: "BEAttrNameOperationPointwiseYdesc", + BEAttrNameOperationPointwiseAlpha1: "BEAttrNameOperationPointwiseAlpha1", + BEAttrNameOperationPointwiseAlpha2: "BEAttrNameOperationPointwiseAlpha2", + BEAttrNameOperationGenstatsMode: "BEAttrNameOperationGenstatsMode", + BEAttrNameOperationGenstatsMathPrec: "BEAttrNameOperationGenstatsMathPrec", + BEAttrNameOperationGenstatsXdesc: "BEAttrNameOperationGenstatsXdesc", + BEAttrNameOperationGenstatsSumdesc: "BEAttrNameOperationGenstatsSumdesc", + BEAttrNameOperationGenstatsSqsumdesc: "BEAttrNameOperationGenstatsSqsumdesc", + BEAttrNameOperationgraphHandle: "BEAttrNameOperationgraphHandle", + BEAttrNameOperationgraphOps: "BEAttrNameOperationgraphOps", + BEAttrNameOperationgraphEngineGlobalCount: "BEAttrNameOperationgraphEngineGlobalCount", + BEAttrNameTensorByteAlignment: "BEAttrNameTensorByteAlignment", + BEAttrNameTensorDataType: "BEAttrNameTensorDataType", + BEAttrNameTensorDimensions: "BEAttrNameTensorDimensions", + BEAttrNameTensorStrides: "BEAttrNameTensorStrides", + BEAttrNameTensorVectorCount: "BEAttrNameTensorVectorCount", + BEAttrNameTensorVectorizedDimension: "BEAttrNameTensorVectorizedDimension", + BEAttrNameTensorUniqueId: "BEAttrNameTensorUniqueId", + BEAttrNameTensorIsVirtual: "BEAttrNameTensorIsVirtual", + 
BEAttrNameVariantPackUniqueIds: "BEAttrNameVariantPackUniqueIds", + BEAttrNameVariantPackDataPointers: "BEAttrNameVariantPackDataPointers", + BEAttrNameVariantPackIntermediates: "BEAttrNameVariantPackIntermediates", + BEAttrNameVariantPackWorkspace: "BEAttrNameVariantPackWorkspace", + BEAttrNameLayoutInfoTensorUid: "BEAttrNameLayoutInfoTensorUid", + BEAttrNameLayoutInfoTypes: "BEAttrNameLayoutInfoTypes", + BEAttrNameKnobInfoType: "BEAttrNameKnobInfoType", + BEAttrNameKnobInfoMaximumValue: "BEAttrNameKnobInfoMaximumValue", + BEAttrNameKnobInfoMinimumValue: "BEAttrNameKnobInfoMinimumValue", + BEAttrNameKnobInfoStride: "BEAttrNameKnobInfoStride", + BEAttrNameEngineOperationGraph: "BEAttrNameEngineOperationGraph", + BEAttrNameEngineGlobalIndex: "BEAttrNameEngineGlobalIndex", + BEAttrNameEngineKnobInfo: "BEAttrNameEngineKnobInfo", + BEAttrNameEngineNumericalNote: "BEAttrNameEngineNumericalNote", + BEAttrNameEngineLayoutInfo: "BEAttrNameEngineLayoutInfo", } -func (e DataType) String() string { return _DataTypeNames[e] } +func (e BackendAttributeName) String() string { return _BackendAttributeNameNames[e] } + +var _BackendAttributeTypeNames = map[BackendAttributeType]string{ + BEAttrHandle: "BEAttrHandle", + BEAttrDataType: "BEAttrDataType", + BEAttrBoolean: "BEAttrBoolean", + BEAttrInt64: "BEAttrInt64", + BEAttrFloat: "BEAttrFloat", + BEAttrDouble: "BEAttrDouble", + BEAttrVoidPtr: "BEAttrVoidPtr", + BEAttrConvolutionMode: "BEAttrConvolutionMode", + BEAttrHeurMode: "BEAttrHeurMode", + BEAttrKnobType: "BEAttrKnobType", + BEAttrNanPropogation: "BEAttrNanPropogation", + BEAttrNumericalNote: "BEAttrNumericalNote", + BEAttrLayoutType: "BEAttrLayoutType", + BEAttrAttribName: "BEAttrAttribName", + BEAttrPointwiseMode: "BEAttrPointwiseMode", + BEAttrBackendDescriptor: "BEAttrBackendDescriptor", + BEAttrGenstatsMode: "BEAttrGenstatsMode", +} -var _MathTypeNames = map[MathType]string{ - DefaultMath: "DefaultMath", - TensorOpMath: "TensorOpMath", +func (e BackendAttributeType) String() string { return _BackendAttributeTypeNames[e] } + +var _BackendDescriptorTypeNames = map[BackendDescriptorType]string{ + BEDescriptorPointwiseDescriptor: "BEDescriptorPointwiseDescriptor", + BEDescriptorConvolutionDescriptor: "BEDescriptorConvolutionDescriptor", + BEDescriptorEngineDescriptor: "BEDescriptorEngineDescriptor", + BEDescriptorEnginecfgDescriptor: "BEDescriptorEnginecfgDescriptor", + BEDescriptorEngineheurDescriptor: "BEDescriptorEngineheurDescriptor", + BEDescriptorExecutionPlanDescriptor: "BEDescriptorExecutionPlanDescriptor", + BEDescriptorIntermediateInfoDescriptor: "BEDescriptorIntermediateInfoDescriptor", + BEDescriptorKnobChoiceDescriptor: "BEDescriptorKnobChoiceDescriptor", + BEDescriptorKnobInfoDescriptor: "BEDescriptorKnobInfoDescriptor", + BEDescriptorLayoutInfoDescriptor: "BEDescriptorLayoutInfoDescriptor", + BEDescriptorOperationConvolutionForwardDescriptor: "BEDescriptorOperationConvolutionForwardDescriptor", + BEDescriptorOperationConvolutionBackwardFilterDescriptor: "BEDescriptorOperationConvolutionBackwardFilterDescriptor", + BEDescriptorOperationConvolutionBackwardDataDescriptor: "BEDescriptorOperationConvolutionBackwardDataDescriptor", + BEDescriptorOperationPointwiseDescriptor: "BEDescriptorOperationPointwiseDescriptor", + BEDescriptorOperationGenStatsDescriptor: "BEDescriptorOperationGenStatsDescriptor", + BEDescriptorOperationgraphDescriptor: "BEDescriptorOperationgraphDescriptor", + BEDescriptorVariantPackDescriptor: "BEDescriptorVariantPackDescriptor", + BEDescriptorTensorDescriptor: 
"BEDescriptorTensorDescriptor", } -func (e MathType) String() string { return _MathTypeNames[e] } +func (e BackendDescriptorType) String() string { return _BackendDescriptorTypeNames[e] } -var _NanPropagationNames = map[NanPropagation]string{ - NotPropagateNan: "NotPropagateNan", - PropagateNan: "PropagateNan", +var _BackendHeurModeNames = map[BackendHeurMode]string{ + Instant: "Instant", + SCount: "SCount", } -func (e NanPropagation) String() string { return _NanPropagationNames[e] } +func (e BackendHeurMode) String() string { return _BackendHeurModeNames[e] } + +var _BackendKnobTypeNames = map[BackendKnobType]string{ + SplitK: "SplitK", + Swizzle: "Swizzle", + TileSize: "TileSize", + UseTex: "UseTex", + Edge: "Edge", + Kblock: "Kblock", + Ldga: "Ldga", + Ldgb: "Ldgb", + ChunkK: "ChunkK", + SplitH: "SplitH", + WinoTile: "WinoTile", + Multiply: "Multiply", + SplitKBuf: "SplitKBuf", + Tilek: "Tilek", + Stages: "Stages", + ReductionMode: "ReductionMode", + CtaSplitKMode: "CtaSplitKMode", + SplitKSlc: "SplitKSlc", + IdxMode: "IdxMode", + Sliced: "Sliced", + SplitRs: "SplitRs", + Singlebuffer: "Singlebuffer", + Ldgc: "Ldgc", + Specfilt: "Specfilt", + Counts: "Counts", +} + +func (e BackendKnobType) String() string { return _BackendKnobTypeNames[e] } + +var _BackendLayoutTypeNames = map[BackendLayoutType]string{ + BELayoutPreferredNchw: "BELayoutPreferredNchw", + BELayoutPreferredNhwc: "BELayoutPreferredNhwc", + BELayoutPreferredPad4ck: "BELayoutPreferredPad4ck", + BELayoutPreferredPad8ck: "BELayoutPreferredPad8ck", + BELayoutCount: "BELayoutCount", +} + +func (e BackendLayoutType) String() string { return _BackendLayoutTypeNames[e] } + +var _BackendNumericalNoteNames = map[BackendNumericalNote]string{ + TensorCore: "TensorCore", + DownConvertInputs: "DownConvertInputs", + ReducedPrecisionReduction: "ReducedPrecisionReduction", + Fft: "Fft", + Nondeterministic: "Nondeterministic", + Winograd: "Winograd", + TypeCount: "TypeCount", +} + +func (e BackendNumericalNote) String() string { return _BackendNumericalNoteNames[e] } + +var _BatchNormModeNames = map[BatchNormMode]string{ + PerActivation: "PerActivation", + Spatial: "Spatial", + SpatialPersistent: "SpatialPersistent", +} + +func (e BatchNormMode) String() string { return _BatchNormModeNames[e] } + +var _BatchNormOpsNames = map[BatchNormOps]string{ + BatchNorm: "BatchNorm", + BatchNormActivation: "BatchNormActivation", + BatchNormAddActivation: "BatchNormAddActivation", +} + +func (e BatchNormOps) String() string { return _BatchNormOpsNames[e] } + +var _CTCLossAlgoNames = map[CTCLossAlgo]string{ + DeterministicCTCLoss: "DeterministicCTCLoss", + NonDeterministicCTCLoss: "NonDeterministicCTCLoss", +} + +func (e CTCLossAlgo) String() string { return _CTCLossAlgoNames[e] } + +var _DataTypeNames = map[DataType]string{ + Float: "Float", + Double: "Double", + Half: "Half", + Int8: "Int8", + Int32: "Int32", + Int8x4: "Int8x4", + Uint8: "Uint8", + Uint8x4: "Uint8x4", + Int8x32: "Int8x32", +} + +func (e DataType) String() string { return _DataTypeNames[e] } var _DeterminismNames = map[Determinism]string{ NonDeterministic: "NonDeterministic", @@ -45,45 +253,143 @@ var _DeterminismNames = map[Determinism]string{ func (e Determinism) String() string { return _DeterminismNames[e] } -var _TensorFormatNames = map[TensorFormat]string{ - NCHW: "NCHW", - NHWC: "NHWC", - NCHWVectC: "NCHWVectC", +var _DirectionModeNames = map[DirectionMode]string{ + Unidirectional: "Unidirectional", + Bidirectional: "Bidirectional", } -func (e TensorFormat) String() string { 
return _TensorFormatNames[e] } +func (e DirectionMode) String() string { return _DirectionModeNames[e] } -var _OpTensorOpNames = map[OpTensorOp]string{ - Add: "Add", - Mul: "Mul", - Min: "Min", - Max: "Max", - Sqrt: "Sqrt", - Not: "Not", +var _DivNormModeNames = map[DivNormMode]string{ + PrecomputedMeans: "PrecomputedMeans", } -func (e OpTensorOp) String() string { return _OpTensorOpNames[e] } +func (e DivNormMode) String() string { return _DivNormModeNames[e] } -var _ReduceTensorOpNames = map[ReduceTensorOp]string{ - ReduceAdd: "ReduceAdd", - ReduceMul: "ReduceMul", - ReduceMin: "ReduceMin", - ReduceMax: "ReduceMax", - ReduceAmax: "ReduceAmax", - ReduceAvg: "ReduceAvg", - ReduceNorm1: "ReduceNorm1", - ReduceNorm2: "ReduceNorm2", - ReduceMulNoZeros: "ReduceMulNoZeros", +var _ErrQueryModeNames = map[ErrQueryMode]string{ + Rawcode: "Rawcode", + Nonblocking: "Nonblocking", + Blocking: "Blocking", } -func (e ReduceTensorOp) String() string { return _ReduceTensorOpNames[e] } +func (e ErrQueryMode) String() string { return _ErrQueryModeNames[e] } -var _ReduceTensorIndicesNames = map[ReduceTensorIndices]string{ - ReduceNoIndices: "ReduceNoIndices", - ReduceFlattenedIndices: "ReduceFlattenedIndices", +var _FoldingDirectionNames = map[FoldingDirection]string{ + Fold: "Fold", + Unfold: "Unfold", } -func (e ReduceTensorIndices) String() string { return _ReduceTensorIndicesNames[e] } +func (e FoldingDirection) String() string { return _FoldingDirectionNames[e] } + +var _ForwardModeNames = map[ForwardMode]string{ + Inference: "Inference", + Training: "Training", +} + +func (e ForwardMode) String() string { return _ForwardModeNames[e] } + +var _FusedOpsConstParamLabelNames = map[FusedOpsConstParamLabel]string{ + Xdesc: "Xdesc", + XdataPlaceholder: "XdataPlaceholder", + BnMode: "BnMode", + BnEqscalebiasDesc: "BnEqscalebiasDesc", + BnEqscalePlaceholder: "BnEqscalePlaceholder", + BnEqbiasPlaceholder: "BnEqbiasPlaceholder", + ActivationDesc: "ActivationDesc", + ConvDesc: "ConvDesc", + Wdesc: "Wdesc", + WdataPlaceholder: "WdataPlaceholder", + Dwdesc: "Dwdesc", + DwdataPlaceholder: "DwdataPlaceholder", + Ydesc: "Ydesc", + YdataPlaceholder: "YdataPlaceholder", + Dydesc: "Dydesc", + DydataPlaceholder: "DydataPlaceholder", + YstatsDesc: "YstatsDesc", + YsumPlaceholder: "YsumPlaceholder", + YsqsumPlaceholder: "YsqsumPlaceholder", + BnScalebiasMeanvarDesc: "BnScalebiasMeanvarDesc", + BnScalePlaceholder: "BnScalePlaceholder", + BnBiasPlaceholder: "BnBiasPlaceholder", + BnSavedMeanPlaceholder: "BnSavedMeanPlaceholder", + BnSavedInvstdPlaceholder: "BnSavedInvstdPlaceholder", + BnRunningMeanPlaceholder: "BnRunningMeanPlaceholder", + BnRunningVarPlaceholder: "BnRunningVarPlaceholder", + Zdesc: "Zdesc", + ZdataPlaceholder: "ZdataPlaceholder", + BnZEqscalebiasDesc: "BnZEqscalebiasDesc", + BnZEqscalePlaceholder: "BnZEqscalePlaceholder", + BnZEqbiasPlaceholder: "BnZEqbiasPlaceholder", + ActivationBitmaskDesc: "ActivationBitmaskDesc", + ActivationBitmaskPlaceholder: "ActivationBitmaskPlaceholder", + Dxdesc: "Dxdesc", + DxdataPlaceholder: "DxdataPlaceholder", + Dzdesc: "Dzdesc", + DzdataPlaceholder: "DzdataPlaceholder", + BnDscalePlaceholder: "BnDscalePlaceholder", + BnDbiasPlaceholder: "BnDbiasPlaceholder", +} + +func (e FusedOpsConstParamLabel) String() string { return _FusedOpsConstParamLabelNames[e] } + +var _FusedOpsPointerPlaceHolderNames = map[FusedOpsPointerPlaceHolder]string{ + NullPtr: "NullPtr", + PtrElemAligned: "PtrElemAligned", + Ptr16: "Ptr16", +} + +func (e FusedOpsPointerPlaceHolder) String() string { return 
_FusedOpsPointerPlaceHolderNames[e] } + +var _FusedOpsVariantParamLabelNames = map[FusedOpsVariantParamLabel]string{ + PtrXdata: "PtrXdata", + PtrBnEqscale: "PtrBnEqscale", + PtrBnEqbias: "PtrBnEqbias", + PtrWdata: "PtrWdata", + PtrDwdata: "PtrDwdata", + PtrYdata: "PtrYdata", + PtrDydata: "PtrDydata", + PtrYsum: "PtrYsum", + PtrYsqsum: "PtrYsqsum", + PtrWorkspace: "PtrWorkspace", + PtrBnScale: "PtrBnScale", + PtrBnBias: "PtrBnBias", + PtrBnSavedMean: "PtrBnSavedMean", + PtrBnSavedInvstd: "PtrBnSavedInvstd", + PtrBnRunningMean: "PtrBnRunningMean", + PtrBnRunningVar: "PtrBnRunningVar", + PtrZdata: "PtrZdata", + PtrBnZEqscale: "PtrBnZEqscale", + PtrBnZEqbias: "PtrBnZEqbias", + PtrActivationBitmask: "PtrActivationBitmask", + PtrDxdata: "PtrDxdata", + PtrDzdata: "PtrDzdata", + PtrBnDscale: "PtrBnDscale", + PtrBnDbias: "PtrBnDbias", + ScalarSizeTWorkspaceSizeInBytes: "ScalarSizeTWorkspaceSizeInBytes", + ScalarInt64TBnAccumulationCount: "ScalarInt64TBnAccumulationCount", + ScalarDoubleBnExpAvgFactor: "ScalarDoubleBnExpAvgFactor", + ScalarDoubleBnEpsilon: "ScalarDoubleBnEpsilon", +} + +func (e FusedOpsVariantParamLabel) String() string { return _FusedOpsVariantParamLabelNames[e] } + +var _FusedOpsNames = map[FusedOps]string{ + ScaleBiasActivationConvBnstats: "ScaleBiasActivationConvBnstats", + ScaleBiasActivationWgrad: "ScaleBiasActivationWgrad", + BnFinalizeStatisticsTraining: "BnFinalizeStatisticsTraining", + BnFinalizeStatisticsInference: "BnFinalizeStatisticsInference", + ConvScaleBiasAddActivation: "ConvScaleBiasAddActivation", + ScaleBiasAddActivationGenBitmask: "ScaleBiasAddActivationGenBitmask", + DactivationForkDbatchnorm: "DactivationForkDbatchnorm", +} + +func (e FusedOps) String() string { return _FusedOpsNames[e] } + +var _GenStatsModeNames = map[GenStatsMode]string{ + SumSq: "SumSq", +} + +func (e GenStatsMode) String() string { return _GenStatsModeNames[e] } var _IndicesTypeNames = map[IndicesType]string{ Indices32: "Indices32", @@ -94,20 +400,94 @@ var _IndicesTypeNames = map[IndicesType]string{ func (e IndicesType) String() string { return _IndicesTypeNames[e] } -var _SoftmaxAlgorithmNames = map[SoftmaxAlgorithm]string{ - Fast: "Fast", - Accurate: "Accurate", - Log: "Log", +var _LRNModeNames = map[LRNMode]string{ + CrossChannelDim1: "CrossChannelDim1", } -func (e SoftmaxAlgorithm) String() string { return _SoftmaxAlgorithmNames[e] } +func (e LRNMode) String() string { return _LRNModeNames[e] } -var _SoftmaxModeNames = map[SoftmaxMode]string{ - Instance: "Instance", - Channel: "Channel", +var _LossNormalizationModeNames = map[LossNormalizationMode]string{ + LossNormNone: "LossNormNone", + LossNormSoftmax: "LossNormSoftmax", } -func (e SoftmaxMode) String() string { return _SoftmaxModeNames[e] } +func (e LossNormalizationMode) String() string { return _LossNormalizationModeNames[e] } + +var _MathTypeNames = map[MathType]string{ + DefaultMath: "DefaultMath", + TensorOpMath: "TensorOpMath", + TensorOpMathAllowConversion: "TensorOpMathAllowConversion", + FmaMath: "FmaMath", +} + +func (e MathType) String() string { return _MathTypeNames[e] } + +var _MultiHeadAttnWeightKindNames = map[MultiHeadAttnWeightKind]string{ + QWeights: "QWeights", + KWeights: "KWeights", + VWeights: "VWeights", + OWeights: "OWeights", + QBiases: "QBiases", + KBiases: "KBiases", + VBiases: "VBiases", + OBiases: "OBiases", +} + +func (e MultiHeadAttnWeightKind) String() string { return _MultiHeadAttnWeightKindNames[e] } + +var _NanPropagationNames = map[NanPropagation]string{ + NotPropagateNan: "NotPropagateNan", 
+ PropagateNan: "PropagateNan", +} + +func (e NanPropagation) String() string { return _NanPropagationNames[e] } + +var _NormAlgoNames = map[NormAlgo]string{ + StandardNorm: "StandardNorm", + PersistNorm: "PersistNorm", +} + +func (e NormAlgo) String() string { return _NormAlgoNames[e] } + +var _NormModeNames = map[NormMode]string{ + NormPerActivation: "NormPerActivation", + NormPerChannel: "NormPerChannel", +} + +func (e NormMode) String() string { return _NormModeNames[e] } + +var _NormOpsNames = map[NormOps]string{ + Norm: "Norm", + NormActivation: "NormActivation", + NormAddActivation: "NormAddActivation", +} + +func (e NormOps) String() string { return _NormOpsNames[e] } + +var _OpTensorOpNames = map[OpTensorOp]string{ + TensorAdd: "TensorAdd", + TensorMul: "TensorMul", + TensorMin: "TensorMin", + TensorMax: "TensorMax", + TensorSqrt: "TensorSqrt", + TensorNot: "TensorNot", +} + +func (e OpTensorOp) String() string { return _OpTensorOpNames[e] } + +var _PointwiseModeNames = map[PointwiseMode]string{ + PointwiseAdd: "PointwiseAdd", + PointwiseMul: "PointwiseMul", + PointwiseMin: "PointwiseMin", + PointwiseMax: "PointwiseMax", + PointwiseSqrt: "PointwiseSqrt", + PointwiseReluFwd: "PointwiseReluFwd", + PointwiseTanhFwd: "PointwiseTanhFwd", + PointwiseSigmoidFwd: "PointwiseSigmoidFwd", + PointwiseEluFwd: "PointwiseEluFwd", +} + +func (e PointwiseMode) String() string { return _PointwiseModeNames[e] } var _PoolingModeNames = map[PoolingMode]string{ MaxPooling: "MaxPooling", @@ -118,41 +498,45 @@ var _PoolingModeNames = map[PoolingMode]string{ func (e PoolingMode) String() string { return _PoolingModeNames[e] } -var _ActivationModeNames = map[ActivationMode]string{ - Sigmoid: "Sigmoid", - ReLU: "ReLU", - Tanh: "Tanh", - ClippedReLU: "ClippedReLU", - Elu: "Elu", +var _RNNAlgoNames = map[RNNAlgo]string{ + StandardRNN: "StandardRNN", + PersistStaticRNN: "PersistStaticRNN", + PersistDynamicRNN: "PersistDynamicRNN", + CountRNN: "CountRNN", } -func (e ActivationMode) String() string { return _ActivationModeNames[e] } +func (e RNNAlgo) String() string { return _RNNAlgoNames[e] } -var _LRNModeNames = map[LRNMode]string{ - CrossChannelDim1: "CrossChannelDim1", +var _RNNBiasModeNames = map[RNNBiasMode]string{ + RNNNoBias: "RNNNoBias", + RNNSingleInpBias: "RNNSingleInpBias", + RNNDoubleBias: "RNNDoubleBias", + RNNSingleRecBias: "RNNSingleRecBias", } -func (e LRNMode) String() string { return _LRNModeNames[e] } +func (e RNNBiasMode) String() string { return _RNNBiasModeNames[e] } -var _DivNormModeNames = map[DivNormMode]string{ - PrecomputedMeans: "PrecomputedMeans", +var _RNNClipModeNames = map[RNNClipMode]string{ + RNNClipNone: "RNNClipNone", + RNNClipMinmax: "RNNClipMinmax", } -func (e DivNormMode) String() string { return _DivNormModeNames[e] } +func (e RNNClipMode) String() string { return _RNNClipModeNames[e] } -var _BatchNormModeNames = map[BatchNormMode]string{ - PerActivation: "PerActivation", - Spatial: "Spatial", - SpatialPersistent: "SpatialPersistent", +var _RNNDataLayoutNames = map[RNNDataLayout]string{ + SeqMajorUnpacked: "SeqMajorUnpacked", + SeqMajorPacked: "SeqMajorPacked", + BatchMajorUnpacked: "BatchMajorUnpacked", } -func (e BatchNormMode) String() string { return _BatchNormModeNames[e] } +func (e RNNDataLayout) String() string { return _RNNDataLayoutNames[e] } -var _SamplerTypeNames = map[SamplerType]string{ - Bilinear: "Bilinear", +var _RNNInputModeNames = map[RNNInputMode]string{ + LinearInput: "LinearInput", + SkipInput: "SkipInput", } -func (e SamplerType) String() string { 
return _SamplerTypeNames[e] }
+func (e RNNInputMode) String() string { return _RNNInputModeNames[e] }
 
 var _RNNModeNames = map[RNNMode]string{
 	RNNReLU: "RNNReLU",
@@ -163,31 +547,84 @@ var _RNNModeNames = map[RNNMode]string{
 
 func (e RNNMode) String() string { return _RNNModeNames[e] }
 
-var _DirectionModeNames = map[DirectionMode]string{
-	Unidirectional: "Unidirectional",
-	Bidirectional: "Bidirectional",
+var _ReduceTensorIndicesNames = map[ReduceTensorIndices]string{
+	ReduceNoIndices: "ReduceNoIndices",
+	ReduceFlattenedIndices: "ReduceFlattenedIndices",
 }
 
-func (e DirectionMode) String() string { return _DirectionModeNames[e] }
+func (e ReduceTensorIndices) String() string { return _ReduceTensorIndicesNames[e] }
 
-var _RNNInputModeNames = map[RNNInputMode]string{
-	LinearInput: "LinearInput",
-	SkipInput: "SkipInput",
+var _ReduceTensorOpNames = map[ReduceTensorOp]string{
+	ReduceAdd: "ReduceAdd",
+	ReduceMul: "ReduceMul",
+	ReduceMin: "ReduceMin",
+	ReduceMax: "ReduceMax",
+	ReduceAmax: "ReduceAmax",
+	ReduceAvg: "ReduceAvg",
+	ReduceNorm1: "ReduceNorm1",
+	ReduceNorm2: "ReduceNorm2",
+	ReduceMulNoZeros: "ReduceMulNoZeros",
 }
 
-func (e RNNInputMode) String() string { return _RNNInputModeNames[e] }
+func (e ReduceTensorOp) String() string { return _ReduceTensorOpNames[e] }
 
-var _RNNAlgoNames = map[RNNAlgo]string{
-	Standard: "Standard",
-	PersistStatic: "PersistStatic",
-	PersistDynamic: "PersistDynamic",
+var _ReorderTypeNames = map[ReorderType]string{
+	DefaultReorder: "DefaultReorder",
+	NoReorder: "NoReorder",
 }
 
-func (e RNNAlgo) String() string { return _RNNAlgoNames[e] }
+func (e ReorderType) String() string { return _ReorderTypeNames[e] }
 
-var _CTCLossAlgoNames = map[CTCLossAlgo]string{
-	DeterministicCTCLoss: "DeterministicCTCLoss",
-	NonDeterministicCTCLoss: "NonDeterministicCTCLoss",
+var _SamplerTypeNames = map[SamplerType]string{
+	Bilinear: "Bilinear",
 }
 
-func (e CTCLossAlgo) String() string { return _CTCLossAlgoNames[e] }
+func (e SamplerType) String() string { return _SamplerTypeNames[e] }
+
+var _SeqDataAxisNames = map[SeqDataAxis]string{
+	TimeDim: "TimeDim",
+	BatchDim: "BatchDim",
+	BeamDim: "BeamDim",
+	VectDim: "VectDim",
+}
+
+func (e SeqDataAxis) String() string { return _SeqDataAxisNames[e] }
+
+var _SeverityNames = map[Severity]string{
+	Fatal: "Fatal",
+	Error: "Error",
+	Warning: "Warning",
+	Info: "Info",
+}
+
+func (e Severity) String() string { return _SeverityNames[e] }
+
+var _SoftmaxAlgorithmNames = map[SoftmaxAlgorithm]string{
+	Fast: "Fast",
+	Accurate: "Accurate",
+	Log: "Log",
+}
+
+func (e SoftmaxAlgorithm) String() string { return _SoftmaxAlgorithmNames[e] }
+
+var _SoftmaxModeNames = map[SoftmaxMode]string{
+	Instance: "Instance",
+	Channel: "Channel",
+}
+
+func (e SoftmaxMode) String() string { return _SoftmaxModeNames[e] }
+
+var _TensorFormatNames = map[TensorFormat]string{
+	NCHW: "NCHW",
+	NHWC: "NHWC",
+	NCHWVectC: "NCHWVectC",
+}
+
+func (e TensorFormat) String() string { return _TensorFormatNames[e] }
+
+var _WgradModeNames = map[WgradMode]string{
+	Add: "Add",
+	Set: "Set",
+}
+
+func (e WgradMode) String() string { return _WgradModeNames[e] }
diff --git a/dnn/generated_fusedopconsts.go b/dnn/generated_fusedopconsts.go
new file mode 100644
index 0000000..cc893e1
--- /dev/null
+++ b/dnn/generated_fusedopconsts.go
@@ -0,0 +1,54 @@
+package cudnn
+
+/* WAS Generated by gencudnn. DO NOT EDIT */
+
+// #include <cudnn.h>
+import "C"
+import (
+	"runtime"
+	"unsafe"
+)
+
+// FusedOpConsts is a representation of cudnnFusedOpsConstParamPack_t.
+type FusedOpConsts struct {
+	internal C.cudnnFusedOpsConstParamPack_t
+
+	ops        FusedOps
+	paramLabel FusedOpsConstParamLabel
+	param      Memory
+}
+
+// NewFusedOpConsts creates a new FusedOpConsts.
+func NewFusedOpConsts(ops FusedOps, paramLabel FusedOpsConstParamLabel, param Memory) (retVal *FusedOpConsts, err error) {
+	var internal C.cudnnFusedOpsConstParamPack_t
+	if err := result(C.cudnnCreateFusedOpsConstParamPack(&internal, ops.C())); err != nil {
+		return nil, err
+	}
+
+	if err := result(C.cudnnSetFusedOpsConstParamPackAttribute(internal, paramLabel.C(), unsafe.Pointer(param.Uintptr()))); err != nil {
+		return nil, err
+	}
+
+	retVal = &FusedOpConsts{
+		internal:   internal,
+		paramLabel: paramLabel,
+		param:      param,
+		ops:        ops,
+	}
+	runtime.SetFinalizer(retVal, destroyFusedOpConsts)
+	return retVal, nil
+}
+
+// C returns the internal cgo representation.
+func (f *FusedOpConsts) C() C.cudnnFusedOpsConstParamPack_t { return f.internal }
+
+// Ops returns the ops that were fused.
+func (f *FusedOpConsts) Ops() FusedOps { return f.ops }
+
+// ParamLabel returns the internal paramLabel.
+func (f *FusedOpConsts) ParamLabel() FusedOpsConstParamLabel { return f.paramLabel }
+
+// Param returns the internal param.
+func (f *FusedOpConsts) Param() Memory { return f.param }
+
+func destroyFusedOpConsts(obj *FusedOpConsts) { C.cudnnDestroyFusedOpsConstParamPack(obj.internal) }
diff --git a/dnn/generated_fusedopvariantparams.go b/dnn/generated_fusedopvariantparams.go
new file mode 100644
index 0000000..dd99cf9
--- /dev/null
+++ b/dnn/generated_fusedopvariantparams.go
@@ -0,0 +1,56 @@
+package cudnn
+
+/* WAS Generated by gencudnn. DO NOT EDIT */
+
+// #include <cudnn.h>
+import "C"
+import (
+	"runtime"
+	"unsafe"
+)
+
+// FusedOpVariantParams is a representation of cudnnFusedOpsVariantParamPack_t.
+type FusedOpVariantParams struct {
+	internal C.cudnnFusedOpsVariantParamPack_t
+
+	ops        FusedOps
+	paramLabel FusedOpsVariantParamLabel
+	ptr        Memory
+}
+
+// NewFusedOpVariantParams creates a new FusedOpVariantParams.
+func NewFusedOpVariantParams(ops FusedOps, paramLabel FusedOpsVariantParamLabel, ptr Memory) (retVal *FusedOpVariantParams, err error) {
+	var internal C.cudnnFusedOpsVariantParamPack_t
+	if err := result(C.cudnnCreateFusedOpsVariantParamPack(&internal, ops.C())); err != nil {
+		return nil, err
+	}
+
+	if err := result(C.cudnnSetFusedOpsVariantParamPackAttribute(internal, paramLabel.C(), unsafe.Pointer(ptr.Uintptr()))); err != nil {
+		return nil, err
+	}
+
+	retVal = &FusedOpVariantParams{
+		internal:   internal,
+		paramLabel: paramLabel,
+		ptr:        ptr,
+		ops:        ops,
+	}
+	runtime.SetFinalizer(retVal, destroyFusedOpVariantParams)
+	return retVal, nil
+}
+
+// C returns the internal cgo representation.
+func (f *FusedOpVariantParams) C() C.cudnnFusedOpsVariantParamPack_t { return f.internal }
+
+// Ops returns the ops that are fused.
+func (f *FusedOpVariantParams) Ops() FusedOps { return f.ops }
+
+// ParamLabel returns the internal paramLabel.
+func (f *FusedOpVariantParams) ParamLabel() FusedOpsVariantParamLabel { return f.paramLabel }
+
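+// An illustrative sketch of how the two packs are constructed together.
+// The op and labels come from the generated name maps; the pairing below and
+// the `xDescMem`/`xData` Memory values are hypothetical:
+//
+//	consts, err := NewFusedOpConsts(ScaleBiasActivationConvBnstats, Xdesc, xDescMem)
+//	if err != nil { /* handle */ }
+//	variants, err := NewFusedOpVariantParams(ScaleBiasActivationConvBnstats, PtrXdata, xData)
+//	if err != nil { /* handle */ }
+//	_, _ = consts.Ops(), variants.Ops() // both packs target the same fused op
+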
+// Ptr returns the internal ptr.
+func (f *FusedOpVariantParams) Ptr() Memory { return f.ptr }
+
+func destroyFusedOpVariantParams(obj *FusedOpVariantParams) {
+	C.cudnnDestroyFusedOpsVariantParamPack(obj.internal)
+}
diff --git a/dnn/generated_lrn.go b/dnn/generated_lrn.go
index 2861230..c4d5e67 100644
--- a/dnn/generated_lrn.go
+++ b/dnn/generated_lrn.go
@@ -38,6 +38,9 @@ func NewLRN(lrnN uint, lrnAlpha float64, lrnBeta float64, lrnK float64) (retVal
 	return retVal, nil
 }
 
+// C returns the internal cgo representation.
+func (l *LRN) C() C.cudnnLRNDescriptor_t { return l.internal }
+
 // LrnN returns the internal lrnN.
 func (l *LRN) LrnN() uint { return l.lrnN }
 
diff --git a/dnn/generated_reduction.go b/dnn/generated_reduction.go
index c3ac722..3b119e5 100644
--- a/dnn/generated_reduction.go
+++ b/dnn/generated_reduction.go
@@ -40,6 +40,9 @@ func NewReduction(reduceTensorOp ReduceTensorOp, reduceTensorCompType DataType,
 	return retVal, nil
 }
 
+// C returns the internal cgo representation.
+func (r *Reduction) C() C.cudnnReduceTensorDescriptor_t { return r.internal }
+
 // ReduceTensorOp returns the internal reduceTensorOp.
 func (r *Reduction) ReduceTensorOp() ReduceTensorOp { return r.reduceTensorOp }
 
diff --git a/dnn/generated_rnndata.go b/dnn/generated_rnndata.go
new file mode 100644
index 0000000..efe777c
--- /dev/null
+++ b/dnn/generated_rnndata.go
@@ -0,0 +1,77 @@
+package cudnn
+
+/* Generated by gencudnn. DO NOT EDIT */
+
+// #include <cudnn.h>
+import "C"
+import (
+	"runtime"
+	"unsafe"
+)
+
+// RNNData is a representation of cudnnRNNDataDescriptor_t.
+type RNNData struct {
+	internal C.cudnnRNNDataDescriptor_t
+
+	dataType       DataType
+	layout         RNNDataLayout
+	maxSeqLength   int
+	batchSize      int
+	vectorSize     int
+	seqLengthArray []int
+	paddingFill    Memory
+}
+
+// NewRNNData creates a new RNNData.
+func NewRNNData(dataType DataType, layout RNNDataLayout, maxSeqLength int, batchSize int, vectorSize int, seqLengthArray []int, paddingFill Memory) (retVal *RNNData, err error) {
+	var internal C.cudnnRNNDataDescriptor_t
+	if err := result(C.cudnnCreateRNNDataDescriptor(&internal)); err != nil {
+		return nil, err
+	}
+
+	seqLengthArrayC, seqLengthArrayCManaged := ints2CIntPtr(seqLengthArray)
+	defer returnManaged(seqLengthArrayCManaged)
+
+	if err := result(C.cudnnSetRNNDataDescriptor(internal, dataType.C(), layout.C(), C.int(maxSeqLength), C.int(batchSize), C.int(vectorSize), seqLengthArrayC, unsafe.Pointer(paddingFill.Uintptr()))); err != nil {
+		return nil, err
+	}
+
+	retVal = &RNNData{
+		internal:       internal,
+		dataType:       dataType,
+		layout:         layout,
+		maxSeqLength:   maxSeqLength,
+		batchSize:      batchSize,
+		vectorSize:     vectorSize,
+		seqLengthArray: seqLengthArray,
+		paddingFill:    paddingFill,
+	}
+	runtime.SetFinalizer(retVal, destroyRNNData)
+	return retVal, nil
+}
+
+// C returns the internal cgo representation of RNNData.
+func (r *RNNData) C() C.cudnnRNNDataDescriptor_t { return r.internal }
+
+// DataType returns the internal dataType.
+func (r *RNNData) DataType() DataType { return r.dataType }
+
+// Layout returns the internal layout.
+func (r *RNNData) Layout() RNNDataLayout { return r.layout }
+
+// MaxSeqLength returns the internal maxSeqLength.
+func (r *RNNData) MaxSeqLength() int { return r.maxSeqLength }
+
+// BatchSize returns the internal batchSize.
+func (r *RNNData) BatchSize() int { return r.batchSize }
+
+// VectorSize returns the internal vectorSize.
+func (r *RNNData) VectorSize() int { return r.vectorSize }
+
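+// An illustrative construction sketch (hypothetical sizes; `padFill` stands
+// for a Memory value used to fill padding positions):
+//
+//	// A batch of 32 sequences, each up to 50 steps long, 512 features per step.
+//	seqLens := make([]int, 32)
+//	for i := range seqLens {
+//		seqLens[i] = 50
+//	}
+//	data, err := NewRNNData(Float, SeqMajorPacked, 50, 32, 512, seqLens, padFill)
+//	if err != nil { /* handle */ }
+//	_ = data // data.MaxSeqLength() == 50, data.BatchSize() == 32
+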
+// PaddingFill returns the internal paddingFill.
+func (r *RNNData) PaddingFill() Memory { return r.paddingFill }
+
+// SeqLengthArray returns the internal `seqLengthArray` slice.
+func (r *RNNData) SeqLengthArray() []int { return r.seqLengthArray }
+
+func destroyRNNData(obj *RNNData) { C.cudnnDestroyRNNDataDescriptor(obj.internal) }
diff --git a/dnn/generated_seqdata.go b/dnn/generated_seqdata.go
new file mode 100644
index 0000000..3f040c0
--- /dev/null
+++ b/dnn/generated_seqdata.go
@@ -0,0 +1,86 @@
+package cudnn
+
+/* WAS Generated by gencudnn. DO NOT EDIT */
+
+// #include <cudnn.h>
+import "C"
+import (
+	"runtime"
+	"unsafe"
+)
+
+// SeqData is a representation of cudnnSeqDataDescriptor_t.
+type SeqData struct {
+	internal C.cudnnSeqDataDescriptor_t
+
+	dataType           DataType
+	nbDims             int
+	dimA               []int
+	axes               []SeqDataAxis
+	seqLengthArraySize uintptr
+	seqLengthArray     []int
+	paddingFill        Memory
+}
+
+// NewSeqData creates a new SeqData.
+func NewSeqData(dataType DataType, nbDims int, dimA []int, axes []SeqDataAxis, seqLengthArraySize uintptr, seqLengthArray []int, paddingFill Memory) (retVal *SeqData, err error) {
+	var internal C.cudnnSeqDataDescriptor_t
+	if err := result(C.cudnnCreateSeqDataDescriptor(&internal)); err != nil {
+		return nil, err
+	}
+
+	dimAC, dimACManaged := ints2CIntPtr(dimA)
+	defer returnManaged(dimACManaged)
+
+	seqLengthArrayC, seqLengthArrayCManaged := ints2CIntPtr(seqLengthArray)
+	defer returnManaged(seqLengthArrayCManaged)
+
+	if err := result(C.cudnnSetSeqDataDescriptor(internal, dataType.C(), C.int(nbDims), dimAC, axes2Ptr(axes), C.size_t(seqLengthArraySize), seqLengthArrayC, unsafe.Pointer(paddingFill.Uintptr()))); err != nil {
+		return nil, err
+	}
+
+	retVal = &SeqData{
+		internal:           internal,
+		dataType:           dataType,
+		nbDims:             nbDims,
+		dimA:               dimA,
+		axes:               axes,
+		seqLengthArraySize: seqLengthArraySize,
+		seqLengthArray:     seqLengthArray,
+		paddingFill:        paddingFill,
+	}
+	runtime.SetFinalizer(retVal, destroySeqData)
+	return retVal, nil
+}
+
+// C returns the internal cgo representation.
+func (s *SeqData) C() C.cudnnSeqDataDescriptor_t { return s.internal }
+
+// DataType returns the internal dataType.
+func (s *SeqData) DataType() DataType { return s.dataType }
+
+// NbDims returns the internal nbDims.
+func (s *SeqData) NbDims() int { return s.nbDims }
+
+// DimA returns the internal dimA slice.
+func (s *SeqData) DimA() []int { return s.dimA }
+
+// Axes returns the internal axes.
+func (s *SeqData) Axes() []SeqDataAxis { return s.axes }
+
+// SeqLengthArraySize returns the internal seqLengthArraySize.
+func (s *SeqData) SeqLengthArraySize() uintptr { return s.seqLengthArraySize }
+
+// SeqLengthArray returns the internal `seqLengthArray` slice.
+func (s *SeqData) SeqLengthArray() []int { return s.seqLengthArray }
+
+// PaddingFill returns the internal paddingFill.
+func (s *SeqData) PaddingFill() Memory { return s.paddingFill }
+
+func destroySeqData(obj *SeqData) { C.cudnnDestroySeqDataDescriptor(obj.internal) }
+
+/* UTIL */
+
+func axes2Ptr(a []SeqDataAxis) (ptr *C.cudnnSeqDataAxis_t) {
+	return (*C.cudnnSeqDataAxis_t)(unsafe.Pointer(&a[0]))
+}
diff --git a/dnn/generated_spatialtransformer.go b/dnn/generated_spatialtransformer.go
index fb27e05..e3b6ca9 100644
--- a/dnn/generated_spatialtransformer.go
+++ b/dnn/generated_spatialtransformer.go
@@ -27,6 +27,7 @@ func NewSpatialTransformer(samplerType SamplerType, dataType DataType, nbDims in
 	dimAC, dimACManaged := ints2CIntPtr(dimA)
 	defer returnManaged(dimACManaged)
+
 	if err := result(C.cudnnSetSpatialTransformerNdDescriptor(internal, samplerType.C(), dataType.C(), C.int(nbDims), dimAC)); err != nil {
 		return nil, err
 	}
@@ -42,6 +43,9 @@ func NewSpatialTransformer(samplerType SamplerType, dataType DataType, nbDims in
 	return retVal, nil
 }
 
+// C returns the internal C representation of the SpatialTransformer.
+func (s *SpatialTransformer) C() C.cudnnSpatialTransformerDescriptor_t { return s.internal }
+
 // SamplerType returns the internal samplerType.
 func (s *SpatialTransformer) SamplerType() SamplerType { return s.samplerType }
 
@@ -51,7 +55,8 @@ func (s *SpatialTransformer) DataType() DataType { return s.dataType }
 // NbDims returns the internal nbDims.
 func (s *SpatialTransformer) NbDims() int { return s.nbDims }
 
-//TODO: "cudnnSetSpatialTransformerNdDescriptor": Parameter 4 Skipped "dimA" of const int[] - unmapped type
+// DimA returns the internal `dimA` slice.
+func (s *SpatialTransformer) DimA() []int { return s.dimA }
 
 func destroySpatialTransformer(obj *SpatialTransformer) {
 	C.cudnnDestroySpatialTransformerDescriptor(obj.internal)
diff --git a/dnn/generated_tensortransform.go b/dnn/generated_tensortransform.go
new file mode 100644
index 0000000..5c026b6
--- /dev/null
+++ b/dnn/generated_tensortransform.go
@@ -0,0 +1,72 @@
+package cudnn
+
+/* WAS Generated by gencudnn. DO NOT EDIT */
+
+// #include <cudnn.h>
+// #include
+import "C"
+import "runtime"
+
+// TensorTransform is a representation of cudnnTensorTransformDescriptor_t.
+type TensorTransform struct {
+	internal C.cudnnTensorTransformDescriptor_t
+
+	nbDims     uint32
+	destFormat TensorFormat
+	padBeforeA []int32
+	padAfterA  []int32
+	foldA      []uint32
+	direction  FoldingDirection
+}
+
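+// An illustrative construction sketch (the pad and fold values below are
+// hypothetical):
+//
+//	// A 4D transform to NCHW that pads H and W by one element on each side.
+//	tt, err := NewTensorTransform(4, NCHW, []int32{0, 0, 1, 1}, []int32{0, 0, 1, 1}, []uint32{1, 1, 1, 1}, Fold)
+//	if err != nil { /* handle */ }
+//	_ = tt.DestFormat() // NCHW
+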
+// NewTensorTransform creates a new TensorTransform.
+func NewTensorTransform(nbDims uint32, destFormat TensorFormat, padBeforeA []int32, padAfterA []int32, foldA []uint32, direction FoldingDirection) (retVal *TensorTransform, err error) {
+	var internal C.cudnnTensorTransformDescriptor_t
+	if err := result(C.cudnnCreateTensorTransformDescriptor(&internal)); err != nil {
+		return nil, err
+	}
+
+	padBeforeAC := int32s2CInt32Ptr(padBeforeA)
+	padAfterAC := int32s2CInt32Ptr(padAfterA)
+	foldAC := uint32s2CUint32Ptr(foldA)
+
+	if err := result(C.cudnnSetTensorTransformDescriptor(internal, C.uint32_t(nbDims), destFormat.C(), padBeforeAC, padAfterAC, foldAC, direction.C())); err != nil {
+		return nil, err
+	}
+
+	retVal = &TensorTransform{
+		internal:   internal,
+		nbDims:     nbDims,
+		destFormat: destFormat,
+		padBeforeA: padBeforeA,
+		padAfterA:  padAfterA,
+		foldA:      foldA,
+		direction:  direction,
+	}
+	runtime.SetFinalizer(retVal, destroyTensorTransform)
+	return retVal, nil
+}
+
+// C returns the cgo representation.
+func (t *TensorTransform) C() C.cudnnTensorTransformDescriptor_t { return t.internal }
+
+// NDims returns the internal nbDims.
+func (t *TensorTransform) NDims() uint32 { return t.nbDims }
+
+// DestFormat returns the internal destFormat.
+func (t *TensorTransform) DestFormat() TensorFormat { return t.destFormat }
+
+// PadBeforeA returns the internal `padBeforeA` padding slice.
+func (t *TensorTransform) PadBeforeA() []int32 { return t.padBeforeA }
+
+// PadAfterA returns the internal `padAfterA` padding slice.
+func (t *TensorTransform) PadAfterA() []int32 { return t.padAfterA }
+
+// FoldA returns the internal `foldA` slice.
+func (t *TensorTransform) FoldA() []uint32 { return t.foldA }
+
+// Direction returns the internal direction.
+func (t *TensorTransform) Direction() FoldingDirection { return t.direction }
+
+func destroyTensorTransform(obj *TensorTransform) {
+	C.cudnnDestroyTensorTransformDescriptor(obj.internal)
+}
diff --git a/dnn/handle.go b/dnn/handle.go
index 9f29e80..7e56735 100644
--- a/dnn/handle.go
+++ b/dnn/handle.go
@@ -2,14 +2,10 @@ package cudnn
 
 // #include <cudnn.h>
 import "C"
-import (
-	"unsafe"
-)
 
 // Memory represents an instance of CUDA memory
 type Memory interface {
 	Uintptr() uintptr
-	Pointer() unsafe.Pointer
 	IsNativelyAccessible() bool
 }
 
diff --git a/dnn/optensor.go b/dnn/optensor.go
index 871fab0..89e04cd 100644
--- a/dnn/optensor.go
+++ b/dnn/optensor.go
@@ -101,9 +101,9 @@ func (ctx *Context) DoOp(op *Op,
 	}
 
 	res := C.cudnnOpTensor(ctx.internal, op.internal,
-		alpha1C, aDesc.internal, aData.Pointer(),
-		alpha2C, bDesc.internal, bData.Pointer(),
-		betaC, cDesc.internal, cData.Pointer(),
+		alpha1C, aDesc.internal, unsafe.Pointer(aData.Uintptr()),
+		alpha2C, bDesc.internal, unsafe.Pointer(bData.Uintptr()),
+		betaC, cDesc.internal, unsafe.Pointer(cData.Uintptr()),
 	)
 	return result(res)
 }
diff --git a/dnn/rnn.go b/dnn/rnn.go
index 033a806..16abfbe 100644
--- a/dnn/rnn.go
+++ b/dnn/rnn.go
@@ -57,6 +57,7 @@ type RNN struct {
 	workspaceSize uintptr
 }
 
+/*
 // NewRNN creates a new RNN.
 func (handle *Context) NewRNN(hiddenSize int, numLayers int, dropout *Dropout, inputMode RNNInputMode, direction DirectionMode, mode RNNMode, algo RNNAlgo, dataType DataType) (retVal *RNN, err error) {
 	var internal C.cudnnRNNDescriptor_t
@@ -64,7 +65,7 @@ func (handle *Context) NewRNN(hiddenSize int, numLayers int, dropout *Dropout, i
 		return nil, err
 	}
 
-	if err := result(C.cudnnSetRNNDescriptor(handle.internal, internal, C.int(hiddenSize), C.int(numLayers), dropout.internal, inputMode.C(), direction.C(), mode.C(), algo.C(), dataType.C())); err != nil {
+	if err := result(C.cudnnSetRNNDescriptor_v8(handle.internal, internal, C.int(hiddenSize), C.int(numLayers), dropout.internal, inputMode.C(), direction.C(), mode.C(), algo.C(), dataType.C())); err != nil {
 		return nil, err
 	}
 
@@ -83,7 +84,7 @@ func (handle *Context) NewRNN(hiddenSize int, numLayers int, dropout *Dropout, i
 	}
 	runtime.SetFinalizer(retVal, destroyRNN)
 	return retVal, nil
-}
+}*/
 
 func (r *RNN) HiddenSize() int { return r.hiddenSize }
 func (r *RNN) NumLayers() int { return r.layers }
diff --git a/dnn/status.go b/dnn/status.go
index 48bba68..f9d7bc0 100644
--- a/dnn/status.go
+++ b/dnn/status.go
@@ -3,10 +3,13 @@ package cudnn
 // #include <cudnn.h>
 import "C"
 
+type Status = cudnnStatus
+
 type cudnnStatus int
 
-func (err cudnnStatus) Error() string { return err.String() }
-func (err cudnnStatus) String() string { return resString[err] }
+func (err cudnnStatus) Error() string  { return err.String() }
+func (err cudnnStatus) String() string { return resString[err] }
+func (err cudnnStatus) C() C.cudnnStatus_t { return C.cudnnStatus_t(err) }
 
 func result(x C.cudnnStatus_t) error {
 	err := cudnnStatus(x)
diff --git a/dnn/tensor.go b/dnn/tensor.go
index 23046b8..d926db5 100644
--- a/dnn/tensor.go
+++ b/dnn/tensor.go
@@ -4,8 +4,6 @@ package cudnn
 import "C"
 import (
 	"runtime"
-
-	"github.com/pkg/errors"
 )
 
 type TensorDescriptor struct {
@@ -40,14 +38,15 @@ func NewTensorDescriptor(format TensorFormat, dt DataType, shape, strides []int)
 }
 
 func (t *TensorDescriptor) set(internal C.cudnnTensorDescriptor_t) error {
+
 	switch len(t.shape) {
 	case 4:
-		N, C, H, W := t.shape[0], t.shape[1], t.shape[2], t.shape[3]
+		n, c, h, w := t.shape[0], t.shape[1], t.shape[2], t.shape[3]
 		if len(t.strides) == 4 { // use explicit
 			NStrides, CStrides, HStrides, WStrides := t.strides[0], t.strides[1], t.strides[2], t.strides[3]
 			res := C.cudnnSetTensor4dDescriptorEx(internal, t.dataType.C(),
-				C.int(N), C.int(C), C.int(H), C.int(W),
+				C.int(n), C.int(c), C.int(h), C.int(w),
 				C.int(NStrides), C.int(CStrides), C.int(HStrides), C.int(WStrides),
 			)
 			return result(res)
@@ -55,7 +54,7 @@ func (t *TensorDescriptor) set(internal C.cudnnTensorDescriptor_t) error {
 		// otherwise the strides will be calculated by cudnn
 		res := C.cudnnSetTensor4dDescriptor(internal,
 			t.format.C(), t.dataType.C(),
-			C.int(N), C.int(C), C.int(H), C.int(W),
+			C.int(n), C.int(c), C.int(h), C.int(w),
 		)
 		return result(res)
 	default:
@@ -75,8 +74,6 @@ func (t *TensorDescriptor) set(internal C.cudnnTensorDescriptor_t) error {
 			C.int(len(t.shape)), dimA)
 		return result(res)
 	}
-
-	return errors.Errorf(nyi, "set for len == ", len(t.shape))
 }
 
 func (t *TensorDescriptor) Format() TensorFormat { return t.format }
diff --git a/dnn/todo.go b/dnn/todo.go
new file mode 100644
index 0000000..0ec4b30
--- /dev/null
+++ b/dnn/todo.go
@@ -0,0 +1,24 @@
+package cudnn
+
+// #include <cudnn.h>
+import "C"
+
+// TODO
+
+/*
+func (ctx *Context) GetRNNLinLayerBiasParams(rnnDesc *RNN, pseudoLayer int, xDesc *TensorDescriptor, wDesc *Filter, w Memory,
linLayerID int, linLayerBiasDesc *Filter, linLayerBias TODO) error {
+	// call cudnnGetRNNLinLayerBiasParams
+	return result(C.cudnnGetRNNLinLayerBiasParams(ctx.internal, rnnDesc.internal, C.int(pseudoLayer), xDesc.internal, wDesc.internal, unsafe.Pointer(w.Uintptr()), C.int(linLayerID), linLayerBiasDesc.internal, linLayerBias))
+}
+func (ctx *Context) GetRNNLinLayerMatrixParams(rnnDesc *RNN, pseudoLayer int, xDesc *TensorDescriptor, wDesc *Filter, w Memory, linLayerID int, linLayerMatDesc *Filter, linLayerMat TODO) error {
+	// call cudnnGetRNNLinLayerMatrixParams
+	return result(C.cudnnGetRNNLinLayerMatrixParams(ctx.internal, rnnDesc.internal, C.int(pseudoLayer), xDesc.internal, wDesc.internal, unsafe.Pointer(w.Uintptr()), C.int(linLayerID), linLayerMatDesc.internal, linLayerMat))
+}
+
+// Input. Handle to a previously created cuDNN context. For more information, see cudnnHandle_t.
+func (co *Context) CTCLoss(probsDesc *TensorDescriptor, probs Memory, hostLabels TODO, hostLabelLengths TODO, hostInputLengths TODO, costs Memory, gradientsDesc *TensorDescriptor, gradients Memory, algo CTCLossAlgo, ctcLossDesc *CTCLoss, workspace Memory, workSpaceSizeInBytes uintptr) error {
+	// DOUBLECHECK: "cudnnCTCLoss" returns Memory type in Parameter 8
+	// call cudnnCTCLoss
+	return result(C.cudnnCTCLoss(co.internal, probsDesc.internal, unsafe.Pointer(probs.Uintptr()), hostLabels, hostLabelLengths, hostInputLengths, unsafe.Pointer(costs.Uintptr()), gradientsDesc.internal, unsafe.Pointer(gradients.Uintptr()), algo.C(), ctcLossDesc.internal, unsafe.Pointer(workspace.Uintptr()), C.size_t(workSpaceSizeInBytes)))
+}
+*/
diff --git a/execution.go b/execution.go
index a5e71d8..c31536f 100644
--- a/execution.go
+++ b/execution.go
@@ -74,3 +74,9 @@ func (ctx *Ctx) LaunchKernel(fn Function, gridDimX, gridDimY, gridDimZ int, bloc
 
 	ctx.err = ctx.Do(f)
 }
+
+// STUB
+func LaunchCooperativeKernel() {}
+
+// STUB
+func LauncCooperativeKernelMultiDevice() {}
diff --git a/fake.go b/fake.go
new file mode 100644
index 0000000..ff25db0
--- /dev/null
+++ b/fake.go
@@ -0,0 +1,58 @@
+package cu
+
+/*
+#include <stdlib.h>
+void handleCUDACB(void* v);
+*/
+import "C"
+import (
+	"sync"
+	"unsafe"
+)
+
+// fake.go handles the faking of C pointers for Go functions.
+
+var fakepointers = make(map[unsafe.Pointer]HostFunction)
+var lock sync.RWMutex
+
+// RegisterFunc is used to register a Go-based callback such that it may be called by CUDA.
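+//
+// A usage sketch (this assumes HostFunction is a niladic func type, which is
+// what the call in handleCUDACB below implies; the callback itself is hypothetical):
+//
+//	var fn HostFunction = func() { log.Println("called back from CUDA") }
+//	p := RegisterFunc(fn) // p is a C-allocated stand-in pointer identifying fn
+//	defer deregisterFunc(p)
+//	// Hand p to CUDA wherever a user-data pointer is expected; when CUDA fires
+//	// the callback, handleCUDACB(p) looks fn up in fakepointers and calls it.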
+func RegisterFunc(fn HostFunction) unsafe.Pointer { + var ptr unsafe.Pointer = C.malloc(C.size_t(1)) + if ptr == nil { + panic("Cannot allocate a fake pointer") + } + + lock.Lock() + fakepointers[ptr] = fn + lock.Unlock() + + return ptr +} + +func getHostFn(ptr unsafe.Pointer) HostFunction { + if ptr == nil { + return nil + } + lock.RLock() + retVal := fakepointers[ptr] + lock.RUnlock() + return retVal +} + +func deregisterFunc(ptr unsafe.Pointer) { + if ptr == nil { + return + } + + lock.Lock() + delete(fakepointers, ptr) + lock.Unlock() + + C.free(ptr) +} + +//export handleCUDACB +func handleCUDACB(fn unsafe.Pointer) { + callback := getHostFn(fn) + callback() +} diff --git a/go.mod b/go.mod index 24d0906..592ec70 100644 --- a/go.mod +++ b/go.mod @@ -5,17 +5,15 @@ go 1.13 require ( github.com/cloudflare/cfssl v0.0.0-20190808011637-b1ec8c586c2a github.com/cznic/cc v0.0.0-20181122101902-d673e9b70d4d - github.com/cznic/golex v0.0.0-20181122101858-9c343928389c // indirect - github.com/cznic/mathutil v0.0.0-20181122101859-297441e03548 // indirect - github.com/cznic/strutil v0.0.0-20181122101858-275e90344537 // indirect - github.com/cznic/xc v0.0.0-20181122101856-45b06973881e github.com/gonum/blas v0.0.0-20181208220705-f22b278b28ac - github.com/gorgonia/bindgen v0.0.0-20180812032444-09626750019e + github.com/google/uuid v1.1.1 + github.com/gorgonia/bindgen v0.0.0-20210223094355-432cd89e7765 github.com/kr/pretty v0.1.0 - github.com/pkg/errors v0.8.1 - github.com/remyoudompheng/bigfft v0.0.0-20190728182440-6a916e37a237 // indirect - github.com/stretchr/testify v1.4.0 - gonum.org/v1/gonum v0.0.0-20190902003836-43865b531bee - gorgonia.org/gorgonia v0.9.2 - gorgonia.org/tensor v0.9.0-beta + github.com/pkg/errors v0.9.1 + github.com/stretchr/testify v1.6.1 + gonum.org/v1/gonum v0.8.2 + gorgonia.org/gorgonia v0.9.17 + gorgonia.org/tensor v0.9.20 + modernc.org/cc v1.0.1 + modernc.org/xc v1.0.0 ) diff --git a/go.sum b/go.sum index 26b4ed6..387a679 100644 --- a/go.sum +++ b/go.sum @@ -1,12 +1,23 @@ +cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= +github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc/go.mod h1:c9sxoIT3YgLxH4UhLOCKaBlEojuMhVYpk4Ntv3opUTQ= +github.com/apache/arrow/go/arrow v0.0.0-20210105145422-88aaea5262db h1:x5taMU/KYJ8djMqp6eLMHQdcf6RZ+19lmAH7XTK6tmo= +github.com/apache/arrow/go/arrow v0.0.0-20210105145422-88aaea5262db/go.mod h1:c9sxoIT3YgLxH4UhLOCKaBlEojuMhVYpk4Ntv3opUTQ= github.com/awalterschulze/gographviz v0.0.0-20190221210632-1e9ccb565bca h1:xwIXr1FpA2XBoohlpvgb11No/zbsh5Clm/98PWPcHVA= github.com/awalterschulze/gographviz v0.0.0-20190221210632-1e9ccb565bca/go.mod h1:GEV5wmg4YquNw7v1kkyoX9etIk8yVmXj+AkDHuuETHs= +github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/chewxy/hm v1.0.0 h1:zy/TSv3LV2nD3dwUEQL2VhXeoXbb9QkpmdRAVUFiA6k= github.com/chewxy/hm v1.0.0/go.mod h1:qg9YI4q6Fkj/whwHR1D+bOGeF7SniIP40VweVepLjg0= -github.com/chewxy/math32 v1.0.0 h1:RTt2SACA7BTzvbsAKVQJLZpV6zY2MZw4bW9L2HEKkHg= github.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0= +github.com/chewxy/math32 v1.0.6/go.mod 
h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs= +github.com/chewxy/math32 v1.0.7-0.20210223031236-a3549c8cb6a9 h1:tYETMGvGcSl1GOLy7hjtvueM/Ax1rn9hpeD3fgbNdT0= +github.com/chewxy/math32 v1.0.7-0.20210223031236-a3549c8cb6a9/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs= +github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cloudflare/cfssl v0.0.0-20190808011637-b1ec8c586c2a h1:ym8P2+ZvUvVtpLzy8wFLLvdggUIU31mvldvxixQQI2o= github.com/cloudflare/cfssl v0.0.0-20190808011637-b1ec8c586c2a/go.mod h1:yMWuSON2oQp+43nFtAV/uvKQIFpSPerB57DCt9t8sSA= +github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/cznic/cc v0.0.0-20181122101902-d673e9b70d4d h1:AePLLLsGE1yOEDAmaJlQ9zd/9qiaEVskYukZ1f2srAA= github.com/cznic/cc v0.0.0-20181122101902-d673e9b70d4d/go.mod h1:m3fD/V+XTB35Kh9zw6dzjMY+We0Q7PMf6LLIC4vuG9k= github.com/cznic/golex v0.0.0-20181122101858-9c343928389c h1:G8zTsaqyVfIHpgMFcGgdbhHSFhlNc77rAKkhVbQ9kQg= @@ -19,20 +30,53 @@ github.com/cznic/xc v0.0.0-20181122101856-45b06973881e h1:U9mUTtTukbCdFuphv3QiJB github.com/cznic/xc v0.0.0-20181122101856-45b06973881e/go.mod h1:3oFoiOvCDBYH+swwf5+k/woVmWy7h1Fcyu8Qig/jjX0= github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= +github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= +github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= +github.com/fatih/color v1.10.0/go.mod h1:ELkj/draVOlAH/xkhN6mQ50Qd0MPOk5AAr3maGEBuJM= github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= -github.com/gogo/protobuf v1.2.1 h1:/s5zKNz0uPFCZ5hddgPdo2TK2TVrUNMn0OOX8/aZMTE= github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= +github.com/gogo/protobuf v1.3.1 h1:DqDEcV5aeaTmdFBePNpYsp3FlcVH/2ISVVM9Qf8PSls= +github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= -github.com/golang/protobuf v1.3.0 h1:kbxbvI4Un1LUWKxufD+BiE6AEExYYgkQLQmLFqA1LFk= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.0/go.mod h1:Qd/q+1AKNOZr9uGQzbzCmRO6sUih6GTPZv6a1/R87v0= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= +github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= +github.com/golang/protobuf v1.4.0-rc.2/go.mod 
h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= +github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= +github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= +github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/golang/protobuf v1.4.3 h1:JjCZWpVbqXDqFVmTfYWEVTMIYrL/NPdPSCHPJ0T/raM= +github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/gonum/blas v0.0.0-20181208220705-f22b278b28ac h1:Q0Jsdxl5jbxouNs1TQYt0gxesYMU4VXRbsTlgDloZ50= github.com/gonum/blas v0.0.0-20181208220705-f22b278b28ac/go.mod h1:P32wAyui1PQ58Oce/KYkOqQv8cVw1zAapXOl+dRFGbc= -github.com/google/flatbuffers v1.10.0 h1:wHCM5N1xsJ3VwePcIpVqnmjAqRXlR44gv4hpGi+/LIw= github.com/google/flatbuffers v1.10.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= -github.com/gorgonia/bindgen v0.0.0-20180812032444-09626750019e h1:s7hYZXDub8rGKuh712Cdd+zAuti5trhwWmEZvjWnxEw= +github.com/google/flatbuffers v1.11.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= +github.com/google/flatbuffers v1.12.0 h1:/PtAHvnBY4Kqnx/xCQ3OIV9uYcSFGScBsWI3Oogeh6w= +github.com/google/flatbuffers v1.12.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.3 h1:x95R7cp+rSeeqAMI2knLtQ0DKlaBhv2NrtrOvafPHRo= +github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/uuid v1.1.1 h1:Gkbcsh/GbpXz7lPftLA3P6TYMwjCLYm83jiFQZF/3gY= +github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorgonia/bindgen v0.0.0-20180812032444-09626750019e/go.mod h1:YzKk63P9jQHkwAo2rXHBv02yPxDzoQT2cBV0x5bGV/8= +github.com/gorgonia/bindgen v0.0.0-20210223094355-432cd89e7765 h1:O6vP9G95HjXjdSOLqZqzvPZPO9jkw2DU2OJ1ewJiclI= +github.com/gorgonia/bindgen v0.0.0-20210223094355-432cd89e7765/go.mod h1:BLHSe436vhQKRfm6wxJgebeK4fDY+ER/8jV3vVH9yYU= github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= +github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= @@ -41,55 +85,163 @@ github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/leesper/go_rng v0.0.0-20171009123644-5344a9259b21 h1:O75p5GUdUfhJqNCMM1ntthjtJCOHVa1lzMSfh5Qsa0Y= github.com/leesper/go_rng v0.0.0-20171009123644-5344a9259b21/go.mod h1:N0SVk0uhy+E1PZ3C9ctsPRlvOPAFPkCNlcPBDkt0N3U= +github.com/mattn/go-colorable v0.1.8/go.mod 
h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= +github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= github.com/mattn/go-runewidth v0.0.4/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= -github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/remyoudompheng/bigfft v0.0.0-20190728182440-6a916e37a237 h1:HQagqIiBmr8YXawX/le3+O26N+vPPC1PtjaF3mwnook= +github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/remyoudompheng/bigfft v0.0.0-20170806203942-52369c62f446/go.mod h1:uYEyJGbgTkfkS4+E/PavXkNJcbFIpEtjt2B0KDQ5+9M= github.com/remyoudompheng/bigfft v0.0.0-20190728182440-6a916e37a237/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= +github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0 h1:OdAsTTz6OkFY5QxjkYwrChwuRruF69c169dPK26NUlk= +github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.1.4/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.2.0/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.6.1 h1:hDPOHmpOpP40lSULcqw7IrRb/u7w6RpDC9399XyoNd0= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY= github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8= +go4.org/unsafe/assume-no-moving-gc v0.0.0-20201222180813-1025295fd063 h1:1tk03FUNpulq2cuWpXZWj649rwJpk0d20rxWiopKRmc= +go4.org/unsafe/assume-no-moving-gc v0.0.0-20201222180813-1025295fd063/go.mod h1:FftLjUGFEDu5k8lt0ddY+HcrH/qU/0qk+H8j9/nTl3E= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2 h1:y102fOLFqhV41b+4GPiJoa0k/x+pJcEi2/HB1Y5T6fU= +golang.org/x/exp v0.0.0-20181106170214-d68db9428509/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190312203227-4b39c73a6495 h1:I6A9Ag9FpEKOjcKrRNjQkPHawoXIhKyTGfvvjFAiiAk= +golang.org/x/exp 
v0.0.0-20190312203227-4b39c73a6495/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= -golang.org/x/net v0.0.0-20180906233101-161cd47e91fd h1:nTDtHvHSdCn1m6ITfMRqtOd/9+7a3s8RBNOZ3eYZzJA= +golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= +golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= +golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= -golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f h1:wMNYb4v58l5UBM7MYRLPG6ZhfOqbKu7X5eyFl8ZhKvA= +golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20200904194848-62affa334b73 h1:MXfv8rhZWmFeqX3GNZRsd6vOLoaCHjYEX3qkRo3YBUA= +golang.org/x/net v0.0.0-20200904194848-62affa334b73/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190226215855-775f8194d0f9/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200909081042-eff7692f9009 h1:W0lCpv29Hv0UaM1LXb9QlBHLNP8UFfcKjblhVCWftOM= +golang.org/x/sys v0.0.0-20200909081042-eff7692f9009/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 
golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= gonum.org/v1/gonum v0.0.0-20190226202314-149afe6ec0b6/go.mod h1:jevfED4GnIEnJrWW55YmY9DMhajHcnkqVnEXmEtMyNI= -gonum.org/v1/gonum v0.0.0-20190902003836-43865b531bee h1:4pVWuAEGpaPZ7dPfd6aA8LyDNzMA2RKCxAS/XNCLZUM= gonum.org/v1/gonum v0.0.0-20190902003836-43865b531bee/go.mod h1:9mxDZsDKxgMAuccQkewq682L+0eCu4dCN2yonUJTCLU= +gonum.org/v1/gonum v0.8.1-0.20200930085651-eea0b5cb5cc9/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= +gonum.org/v1/gonum v0.8.2 h1:CCXrcPKiGGotvnN6jfUsKk4rRqm7q09/YbKb5xCEvtM= +gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= gonum.org/v1/netlib v0.0.0-20190221094214-0632e2ebbd2d/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= -gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0 h1:OE9mWmgKkjJyEmDAAtGMPjXu+YNeGvK9VTSHY6+Qihc= gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= +gonum.org/v1/netlib v0.0.0-20201012070519-2390d26c3658 h1:/DNJ3wcvPHjTLVNG6rmSHK7uEwdBihyiJRJXB16wXoU= +gonum.org/v1/netlib v0.0.0-20201012070519-2390d26c3658/go.mod h1:zQa7n16lh3Z6FbSTYgjG+KNhz1bA/b9t3plFEaGMp+A= gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= +google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20180831171423-11092d34479b/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= +google.golang.org/genproto v0.0.0-20200911024640-645f7a48b24f h1:Yv4xsIx7HZOoyUGSJ2ksDyWE2qIBXROsZKt2ny3hCGM= 
+google.golang.org/genproto v0.0.0-20200911024640-645f7a48b24f/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= +google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.32.0 h1:zWTV+LMdc3kaiJMSTOFz2UgSBgx8RNQoTGiZu3fR9S0= +google.golang.org/grpc v1.32.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= +google.golang.org/grpc/cmd/protoc-gen-go-grpc v0.0.0-20200910201057-6591123024b3/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= +google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= +google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= +google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= +google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= +google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= +google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= +google.golang.org/protobuf v1.25.0 h1:Ejskq+SyPohKW+1uil0JJMtmHCgJPJ/qWTxr8qp+R4c= +google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/cheggaaa/pb.v1 v1.0.27/go.mod h1:V/YB90LKu/1FcN3WVnfiiE5oMCibMjukxqG/qStrOgw= -gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gorgonia.org/cu v0.9.0-beta/go.mod h1:RPEPIfaxxqUmeRe7T1T8a0NER+KxBI2McoLEXhP1Vd8= -gorgonia.org/dawson v1.1.0 h1:o7+eJ3SKi9sheH19lpOat//tDbg0Y+M9iY/lH79VHqY= +gorgonia.org/cu v0.9.3/go.mod h1:LgyAYDkN7HWhh8orGnCY2R8pP9PYbO44ivEbLMatkVU= gorgonia.org/dawson v1.1.0/go.mod h1:Px1mcziba8YUBIDsbzGwbKJ11uIblv/zkln4jNrZ9Ws= -gorgonia.org/gorgonia v0.9.2 h1:yNe0EfWdEbzWuv0cvAeTpP2dfHH9CZrEz5jGG014pZg= +gorgonia.org/dawson v1.2.0 h1:hJ/aofhfkReSnJdSMDzypRZ/oWDL1TmeYOauBnXKdFw= +gorgonia.org/dawson v1.2.0/go.mod h1:Px1mcziba8YUBIDsbzGwbKJ11uIblv/zkln4jNrZ9Ws= gorgonia.org/gorgonia v0.9.2/go.mod h1:ZtOb9f/wM2OMta1ISGspQ4roGDgz9d9dKOaPNvGR+ec= -gorgonia.org/tensor v0.9.0-beta h1:16QQufB1vbJxVbIOaB5TwkerdlBWtw+AAnZHUZ531ZE= +gorgonia.org/gorgonia v0.9.17 h1:CJOQfgQA5fYd24vPiKKf6v98fRk71s1P7d2GjXNRjVE= +gorgonia.org/gorgonia v0.9.17/go.mod h1:g66b5Z6ATUdhVqYl2ZAAwblv5hnGW08vNinGLcnrceI= gorgonia.org/tensor v0.9.0-beta/go.mod h1:05Y4laKuVlj4qFoZIZW1q/9n1jZkgDBOLmKXZdBLG1w= 
-gorgonia.org/vecf32 v0.7.0 h1:mkpVzSyT7/Cput5/ZxaMzzp2xbmOtqOyJlTf7AdSMe0= +gorgonia.org/tensor v0.9.17/go.mod h1:75SMdLLhZ+2oB0/EE8lFEIt1Caoykdd4bz1mAe59deg= +gorgonia.org/tensor v0.9.20 h1:hYIKZIbpl9LfMKoazvmZLZsbY3CMYLPkJUaMtTr6Fzk= +gorgonia.org/tensor v0.9.20/go.mod h1:75SMdLLhZ+2oB0/EE8lFEIt1Caoykdd4bz1mAe59deg= gorgonia.org/vecf32 v0.7.0/go.mod h1:iHG+kvTMqGYA0SgahfO2k62WRnxmHsqAREGbayRDzy8= -gorgonia.org/vecf64 v0.7.0 h1:ZphOGJfnWlFfY7x8WAJAfO64IAtYqPPq9TEGem+ItZE= +gorgonia.org/vecf32 v0.9.0 h1:PClazic1r+JVJ1dEzRXgeiVl4g1/Hf/w+wUSqnco1Xg= +gorgonia.org/vecf32 v0.9.0/go.mod h1:NCc+5D2oxddRL11hd+pCB1PEyXWOyiQxfZ/1wwhOXCA= gorgonia.org/vecf64 v0.7.0/go.mod h1:1y4pmcSd+wh3phG+InwWQjYrqwyrtN9h27WLFVQfV1Q= +gorgonia.org/vecf64 v0.9.0 h1:bgZDP5x0OzBF64PjMGC3EvTdOoMEcmfAh1VCUnZFm1A= +gorgonia.org/vecf64 v0.9.0/go.mod h1:hp7IOWCnRiVQKON73kkC/AUMtEXyf9kGlVrtPQ9ccVA= +honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +modernc.org/cc v1.0.0/go.mod h1:1Sk4//wdnYJiUIxnW8ddKpaOJCF37yAdqYnkxUpaYxw= +modernc.org/cc v1.0.1 h1:HMzoVgK1dots0bTiIlVqDiQf2TTkOFkccWtnmJZdPdQ= +modernc.org/cc v1.0.1/go.mod h1:uj1/YV+GYVdtSfGOgOtY62Jz8YIiEC0EzZNq481HIQs= +modernc.org/fileutil v1.0.0/go.mod h1:JHsWpkrk/CnVV1H/eGlFf85BEpfkrp56ro8nojIq9Q8= +modernc.org/golex v1.0.0/go.mod h1:b/QX9oBD/LhixY6NDh+IdGv17hgB+51fET1i2kPSmvk= +modernc.org/golex v1.0.1 h1:EYKY1a3wStt0RzHaH8mdSRNg78Ub0OHxYfCRWw35YtM= +modernc.org/golex v1.0.1/go.mod h1:QCA53QtsT1NdGkaZZkF5ezFwk4IXh4BGNafAARTC254= +modernc.org/internal v1.0.0/go.mod h1:VUD/+JAkhCpvkUitlEOnhpVxCgsBI90oTzSCRcqQVSM= +modernc.org/ir v1.0.0/go.mod h1:wxK1nK3PS04CASoUY+HJr+FQywv4+D38y2sRrd71y7s= +modernc.org/lex v1.0.0/go.mod h1:G6rxMTy3cH2iA0iXL/HRRv4Znu8MK4higxph/lE7ypk= +modernc.org/lexer v1.0.0/go.mod h1:F/Dld0YKYdZCLQ7bD0USbWL4YKCyTDRDHiDTOs0q0vk= +modernc.org/mathutil v1.0.0/go.mod h1:wU0vUrJsVWBZ4P6e7xtFJEhFSNsfRLJ8H458uRjg03k= +modernc.org/mathutil v1.1.1 h1:FeylZSVX8S+58VsyJlkEj2bcpdytmp9MmDKZkKx8OIE= +modernc.org/mathutil v1.1.1/go.mod h1:mZW8CKdRPY1v87qxC/wUdX5O1qDzXMP5TH3wjfpga6E= +modernc.org/strutil v1.1.0 h1:+1/yCzZxY2pZwwrsbH+4T7BQMoLQ9QiBshRC9eicYsc= +modernc.org/strutil v1.1.0/go.mod h1:lstksw84oURvj9y3tn8lGvRxyRC1S2+g5uuIzNfIOBs= +modernc.org/token v1.0.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= +modernc.org/xc v1.0.0 h1:7ccXrupWZIS3twbUGrtKmHS2DXY6xegFua+6O3xgAFU= +modernc.org/xc v1.0.0/go.mod h1:mRNCo0bvLjGhHO9WsyuKVU4q0ceiDDDoEeWDJHrNx8I= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git a/graph.go b/graph.go new file mode 100644 index 0000000..b950af9 --- /dev/null +++ b/graph.go @@ -0,0 +1,170 @@ +package cu + +// #include <cuda.h> +import "C" +import ( + "fmt" + "unsafe" + + "github.com/pkg/errors" ) + +// Graph represents a CUDA graph. +type Graph struct{ g C.CUgraph } + +func (g Graph) c() C.CUgraph { return g.g } +func (g Graph) String() string { return fmt.Sprintf("Graph_0x%x", uintptr(unsafe.Pointer(g.g))) } + +// MakeGraph makes a new graph. +func MakeGraph() (Graph, error) { + var g Graph + err := result(C.cuGraphCreate(&g.g, C.uint(0))) // flags must be 0 + return g, err +} + +func (g Graph) Destroy() error { return result(C.cuGraphDestroy(g.g)) } + +func (g Graph) Clone() (Graph, error) { + var c Graph + err := result(C.cuGraphClone(&c.g, g.g)) + return c, err +} + +// AddDependencies adds edges to the graph.
Both `from` and `to` must be the same length. +// An edge will be added from from[i] to to[i]. +// If an edge already exists between the nodes, then an error will be returned. +func (g Graph) AddDependencies(from, to []Node) error { + if len(from) != len(to) { + return errors.Errorf("Expected from and to to have the same length. From is %d long. To is %d long", len(from), len(to)) + } + if len(from) == 0 { + return nil + } + var numDependencies C.size_t + var fromPtr, toPtr *C.CUgraphNode + fromPtr, numDependencies = unsplatNodes(from) + toPtr, _ = unsplatNodes(to) + return result(C.cuGraphAddDependencies(g.c(), fromPtr, toPtr, numDependencies)) +} + +// AddEmptyNode creates an empty node and adds it to the graph. An empty node is a node that performs no operations during execution. It can be used for transitive ordering. +// For example, a phased execution graph with 2 groups of n nodes with a barrier between them can be represented using an empty node and 2*n dependency edges, rather than no empty node and n^2 dependency edges. +func (g Graph) AddEmptyNode(children []Node) (Node, error) { + ptr, numDependencies := unsplatNodes(children) + var retVal Node + err := result(C.cuGraphAddEmptyNode(&retVal.n, g.c(), ptr, numDependencies)) + return retVal, err +} + +// AddHostNode creates a host execution node and adds it to the graph. +// When the graph is launched, the node will invoke the specified CPU function. Host nodes are not supported under MPS with pre-Volta GPUs. +func (g Graph) AddHostNode(children []Node, params *HostNodeParams) (Node, error) { + ptr, numDependencies := unsplatNodes(children) + var retVal Node + err := result(C.cuGraphAddHostNode(&retVal.n, g.c(), ptr, numDependencies, params.c())) + return retVal, err +} + +// AddKernelNode creates a kernel execution node and adds it to the graph. +// When the graph is launched, the node will invoke the specified kernel function. +// +// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g50d871e3bd06c1b835e52f2966ef366b +func (g Graph) AddKernelNode(children []Node, params *KernelNodeParams) (Node, error) { + ptr, numDependencies := unsplatNodes(children) + var retVal Node + err := result(C.cuGraphAddKernelNode(&retVal.n, g.c(), ptr, numDependencies, params.c())) + return retVal, err +} + +// AddMemcpyNode creates a node which performs memcpy. +// +// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g674da6ab54a677f13e0e0e8206ff5073 +func (g Graph) AddMemcpyNode(children []Node, params *CopyParams, ctx Context) (Node, error) { + ptr, numDependencies := unsplatNodes(children) + var retVal Node + err := result(C.cuGraphAddMemcpyNode(&retVal.n, g.c(), ptr, numDependencies, params.c(), ctx.CUDAContext().c())) + return retVal, err +} + +func (g Graph) AddMemsetNode(children []Node, params *MemsetParams, ctx Context) (Node, error) { + ptr, numDependencies := unsplatNodes(children) + var retVal Node + err := result(C.cuGraphAddMemsetNode(&retVal.n, g.c(), ptr, numDependencies, params.c(), ctx.CUDAContext().c())) + return retVal, err +} + +// Edges returns the edges between nodes. CUDA's API is quite dodgy and unclear. It is reproduced below: +// +// Returns a list of hGraph's dependency edges. Edges are returned via corresponding indices in from and to; that is, the node in to[i] has a dependency on the node in from[i]. from and to may both be NULL, in which case this function only returns the number of edges in numEdges.
Otherwise, numEdges entries will be filled in. If numEdges is higher than the actual number of edges, the remaining entries in from and to will be set to NULL, and the number of edges actually returned will be written to numEdges. +func (g Graph) Edges(from, to []Node) (edges []int, numEdges int, err error) { + if len(from) != len(to) { + return nil, -1, errors.Errorf("Expected from and to to have the same length. From is %d long. To is %d long", len(from), len(to)) + } + if len(from) == 0 { + return nil, 0, nil // TODO + } + retVal := make([]C.size_t, len(from)) + retVal[0] = C.size_t(len(from)) + + fromPtr, _ := unsplatNodes(from) + toPtr, _ := unsplatNodes(to) + retPtr := (*C.size_t)(unsafe.Pointer(&retVal[0])) + if err = result(C.cuGraphGetEdges(g.g, fromPtr, toPtr, retPtr)); err != nil { + return nil, -1, err + } + numEdges = len(from) + if len(from) == 0 { + return + } + + edges = make([]int, len(retVal)) + for i := range retVal { + edges[i] = int(retVal[i]) + } + return +} + +// Node represents a CUDA graph node. +type Node struct{ n C.CUgraphNode } + +func (n Node) c() C.CUgraphNode { return n.n } +func (n Node) String() string { return fmt.Sprintf("Node_0x%x", uintptr(unsafe.Pointer(n.n))) } + +// Destroy destroys the node. +func (n Node) Destroy() error { return result(C.cuGraphDestroyNode(n.n)) } + +// AddChild creates a child node that executes an embedded graph, and adds that node to `in`. +// The result is a new node in the `in` graph, and a handle to that child node will be returned. +// +// The childGraph parameter is the graph to clone into the node. +func (n Node) AddChild(in Graph, children []Node, childGraph Graph) (Node, error) { + var retVal Node + ptr, numDependencies := unsplatNodes(children) + err := result(C.cuGraphAddChildGraphNode(&retVal.n, in.c(), ptr, numDependencies, childGraph.c())) + return retVal, err +} + +// ExecGraph represents a CUDA execution graph. +type ExecGraph struct{ g C.CUgraphExec } + +func (g ExecGraph) c() C.CUgraphExec { return g.g } +func (g ExecGraph) String() string { + return fmt.Sprintf("ExecGraph_0x%x", uintptr(unsafe.Pointer(g.g))) +} + +// Destroy destroys the execution graph. +func (g ExecGraph) Destroy() error { return result(C.cuGraphExecDestroy(g.g)) } + +/* utility functions */ + +// unsplatNodes takes a Go slice and converts it to a pointer and a size so that it can be passed into C. +// +// This works because a Node is just a thin struct wrapper around a C.CUgraphNode (i.e. they are the same size). If Node's definition changes, then we will have to allocate a new slice, copy the C.CUgraphNode values into that slice, then unsplat. +func unsplatNodes(a []Node) (cunode *C.CUgraphNode, size C.size_t) { + size = C.size_t(len(a)) + if len(a) > 0 { + cunode = (*C.CUgraphNode)(unsafe.Pointer(&a[0])) + return cunode, size + } + return nil, 0 +} diff --git a/hostfunction.go b/hostfunction.go new file mode 100644 index 0000000..892a324 --- /dev/null +++ b/hostfunction.go @@ -0,0 +1,4 @@ +package cu + +// HostFunction is a closure of a function call with its data. +type HostFunction func()
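For orientation, here is a minimal sketch of how the graph API in graph.go composes. It is a sketch under assumptions, not part of the patch: the package is assumed to be imported as `cu`, only calls added in this diff are used (MakeGraph, AddEmptyNode, AddDependencies, Destroy), and empty nodes stand in for real kernel/memcpy/host work.

package main

import (
	"log"

	cu "gorgonia.org/cu"
)

func main() {
	g, err := cu.MakeGraph()
	if err != nil {
		log.Fatal(err)
	}
	defer g.Destroy()

	// Two empty nodes, each created with no dependencies of its own.
	a, err := g.AddEmptyNode(nil)
	if err != nil {
		log.Fatal(err)
	}
	b, err := g.AddEmptyNode(nil)
	if err != nil {
		log.Fatal(err)
	}

	// Order b after a: AddDependencies adds an edge from from[i] to to[i].
	if err := g.AddDependencies([]cu.Node{a}, []cu.Node{b}); err != nil {
		log.Fatal(err)
	}
}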
diff --git a/params.go b/params.go new file mode 100644 index 0000000..0938730 --- /dev/null +++ b/params.go @@ -0,0 +1,148 @@ +package cu + +/* +#include <cuda.h> + +void CallHostFunc(void* fn){ + handleCUDACB(fn); +}; +*/ +import "C" +import "unsafe" + +// KernelNodeParams represents the parameters to launch a kernel in a graph node. +type KernelNodeParams struct { + Func Function + GridDimX uint + GridDimY uint + GridDimZ uint + BlockDimX uint + BlockDimY uint + BlockDimZ uint + SharedMemBytes uint + + Params []*KernelNodeParams +} + +func (p *KernelNodeParams) c() *C.CUDA_KERNEL_NODE_PARAMS { + // here positional initialization of the struct fields is used because `func` is a keyword in Go. + // see also: https://github.com/golang/go/issues/41968 + retVal := &C.CUDA_KERNEL_NODE_PARAMS{ + p.Func.fn, + C.uint(p.GridDimX), + C.uint(p.GridDimY), + C.uint(p.GridDimZ), + C.uint(p.BlockDimX), + C.uint(p.BlockDimY), + C.uint(p.BlockDimZ), + C.uint(p.SharedMemBytes), + nil, + nil, + } + return retVal +} + +// HostNodeParams are parameters passed to a node that will call a host function (i.e. a function written in Go). +type HostNodeParams struct { + Func HostFunction + Data unsafe.Pointer + + registered bool + ptr unsafe.Pointer +} + +func (p *HostNodeParams) c() *C.CUDA_HOST_NODE_PARAMS { + var ptr unsafe.Pointer + if p.registered { + ptr = p.ptr + } else { + ptr = RegisterFunc(p.Func) + p.ptr = ptr + p.registered = true + } + + return &C.CUDA_HOST_NODE_PARAMS{ + fn: C.CUhostFn(C.CallHostFunc), + userData: ptr, // userData is basically the Go function to call. + } +} + +type CopyParams struct { + SrcXInBytes uint64 + SrcY uint64 + SrcZ uint64 + SrcLOD uint64 + SrcType MemoryType + SrcHost unsafe.Pointer + SrcDevicePtr DevicePtr + SrcArray Array + Reserved0 unsafe.Pointer + SrcPitch uint64 + SrcHeight uint64 + + DstXInBytes uint64 + DstY uint64 + DstZ uint64 + DstLOD uint64 + DstType MemoryType + DstHost unsafe.Pointer + DstDevicePtr DevicePtr + DstArray Array + Reserved1 unsafe.Pointer + DstPitch uint64 + DstHeight uint64 + + WidthInBytes uint64 + Height uint64 + Depth uint64 +} + +func (p *CopyParams) c() *C.CUDA_MEMCPY3D { + return &C.CUDA_MEMCPY3D{ + srcXInBytes: C.size_t(p.SrcXInBytes), + srcY: C.size_t(p.SrcY), + srcZ: C.size_t(p.SrcZ), + srcLOD: C.size_t(p.SrcLOD), + srcMemoryType: C.CUmemorytype(p.SrcType), + srcHost: p.SrcHost, + srcDevice: C.CUdeviceptr(p.SrcDevicePtr), + srcArray: p.SrcArray.c(), + reserved0: nil, + srcPitch: C.size_t(p.SrcPitch), + srcHeight: C.size_t(p.SrcHeight), + dstXInBytes: C.size_t(p.DstXInBytes), + dstY: C.size_t(p.DstY), + dstZ: C.size_t(p.DstZ), + dstLOD: C.size_t(p.DstLOD), + dstMemoryType: C.CUmemorytype(p.DstType), + dstHost: p.DstHost, + dstDevice: C.CUdeviceptr(p.DstDevicePtr), + dstArray: p.DstArray.c(), + reserved1: nil, + dstPitch: C.size_t(p.DstPitch), + dstHeight: C.size_t(p.DstHeight), + WidthInBytes: C.size_t(p.WidthInBytes), + Height: C.size_t(p.Height), + Depth: C.size_t(p.Depth), + } +} + +type MemsetParams struct { + Dst DevicePtr + Pitch uint64 + Value uint + ElementSize uint + Width uint64 + Height uint64 +} + +func (p *MemsetParams) c() *C.CUDA_MEMSET_NODE_PARAMS { + return &C.CUDA_MEMSET_NODE_PARAMS{ + dst: C.CUdeviceptr(p.Dst), + pitch: C.size_t(p.Pitch), + value: C.uint(p.Value), + elementSize: C.uint(p.ElementSize), + width: C.size_t(p.Width), + height: C.size_t(p.Height), + } +} diff --git a/result.go b/result.go index f0773c9..ab34aa5 100644 --- a/result.go +++ b/result.go @@ -4,16 +4,18 @@ package cu import "C" import "fmt" +// This file was generated by the genlib program.
DO NOT EDIT + // cuResult is the Go version of CUresult: // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc6c391505e117393cc2558fff6bfc2e9 type cuResult int -func (err cuResult) Error() string { return err.String() } +func (err cuResult) Error() string { return err.String() } func (err cuResult) String() string { - if msg, ok := resString[err]; ok { - return msg - } - return fmt.Sprintf("UnknownErrorCode:%d", err) + if msg, ok := resString[err]; ok { + return msg + } + return fmt.Sprintf("UnknownErrorCode:%d", err) } func result(x C.CUresult) error { @@ -25,65 +27,82 @@ func result(x C.CUresult) error { } const ( - Success cuResult = C.CUDA_SUCCESS // API call returned with no errors - InvalidValue cuResult = C.CUDA_ERROR_INVALID_VALUE // This indicates that one or more of the parameters passed to the API call is not within an acceptable range of values. - OutOfMemory cuResult = C.CUDA_ERROR_OUT_OF_MEMORY // The API call failed because it was unable to allocate enough memory to perform the requested operation. - NotInitialized cuResult = C.CUDA_ERROR_NOT_INITIALIZED // This indicates that the CUDA driver has not been initialized with cuInit() or that initialization has failed. - Deinitialized cuResult = C.CUDA_ERROR_DEINITIALIZED // This indicates that the CUDA driver is in the process of shutting down. - ProfilerDisabled cuResult = C.CUDA_ERROR_PROFILER_DISABLED // This indicates profiler is not initialized for this run. This can happen when the application is running with external profiling tools like visual profiler. - ProfilerNotInitialized cuResult = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED // Deprecated: This error return is deprecated as of CUDA 5.0. It is no longer an error to attempt to enable/disable the profiling via cuProfilerStart or cuProfilerStop without initialization. - ProfilerAlreadyStarted cuResult = C.CUDA_ERROR_PROFILER_ALREADY_STARTED // Deprecated: This error return is deprecated as of CUDA 5.0. It is no longer an error to call cuProfilerStart() when profiling is already enabled. - ProfilerAlreadyStopped cuResult = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED // Deprecated: This error return is deprecated as of CUDA 5.0. It is no longer an error to call cuProfilerStop() when profiling is already disabled. - NoDevice cuResult = C.CUDA_ERROR_NO_DEVICE // This indicates that no CUDA-capable devices were detected by the installed CUDA driver. - InvalidDevice cuResult = C.CUDA_ERROR_INVALID_DEVICE // This indicates that the device ordinal supplied by the user does not correspond to a valid CUDA device. - InvalidImage cuResult = C.CUDA_ERROR_INVALID_IMAGE // This indicates that the device kernel image is invalid. This can also indicate an invalid CUDA module. - InvalidContext cuResult = C.CUDA_ERROR_INVALID_CONTEXT // This most frequently indicates that there is no context bound to the current thread. This can also be returned if the context passed to an API call is not a valid handle (such as a context that has had cuCtxDestroy() invoked on it). This can also be returned if a user mixes different API versions (i.e. 3010 context with 3020 API calls). See cuCtxGetApiVersion() for more details. - ContextAlreadyCurrent cuResult = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT // Deprecated: This error return is deprecated as of CUDA 3.2. It is no longer an error to attempt to push the active context via cuCtxPushCurrent(). This indicated that the context being supplied as a parameter to the API call was already the active context. 
- MapFailed cuResult = C.CUDA_ERROR_MAP_FAILED // This indicates that a map or register operation has failed. - UnmapFailed cuResult = C.CUDA_ERROR_UNMAP_FAILED // This indicates that an unmap or unregister operation has failed. - ArrayIsMapped cuResult = C.CUDA_ERROR_ARRAY_IS_MAPPED // This indicates that the specified array is currently mapped and thus cannot be destroyed. - AlreadyMapped cuResult = C.CUDA_ERROR_ALREADY_MAPPED // This indicates that the resource is already mapped. - NoBinaryForGpu cuResult = C.CUDA_ERROR_NO_BINARY_FOR_GPU // This indicates that there is no kernel image available that is suitable for the device. This can occur when a user specifies code generation options for a particular CUDA source file that do not include the corresponding device configuration. - AlreadyAcquired cuResult = C.CUDA_ERROR_ALREADY_ACQUIRED // This indicates that a resource has already been acquired. - NotMapped cuResult = C.CUDA_ERROR_NOT_MAPPED // This indicates that a resource is not mapped. - NotMappedAsArray cuResult = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY // This indicates that a mapped resource is not available for access as an array. - NotMappedAsPointer cuResult = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER // This indicates that a mapped resource is not available for access as a pointer. - EccUncorrectable cuResult = C.CUDA_ERROR_ECC_UNCORRECTABLE // This indicates that an uncorrectable ECC error was detected during execution. - UnsupportedLimit cuResult = C.CUDA_ERROR_UNSUPPORTED_LIMIT // This indicates that the CUlimit passed to the API call is not supported by the active device. - ContextAlreadyInUse cuResult = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE // This indicates that the CUcontext passed to the API call can only be bound to a single CPU thread at a time but is already bound to a CPU thread. - PeerAccessUnsupported cuResult = C.CUDA_ERROR_PEER_ACCESS_UNSUPPORTED // This indicates that peer access is not supported across the given devices. - InvalidPtx cuResult = C.CUDA_ERROR_INVALID_PTX // This indicates that a PTX JIT compilation failed. - InvalidGraphicsContext cuResult = C.CUDA_ERROR_INVALID_GRAPHICS_CONTEXT // This indicates an error with OpenGL or DirectX context. - NvlinkUncorrectable cuResult = C.CUDA_ERROR_NVLINK_UNCORRECTABLE // This indicates that an uncorrectable NVLink error was detected during the execution. - InvalidSource cuResult = C.CUDA_ERROR_INVALID_SOURCE // This indicates that the device kernel source is invalid. - FileNotFound cuResult = C.CUDA_ERROR_FILE_NOT_FOUND // This indicates that the file specified was not found. - SharedObjectSymbolNotFound cuResult = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND // This indicates that a link to a shared object failed to resolve. - SharedObjectInitFailed cuResult = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED // This indicates that initialization of a shared object failed. - OperatingSystem cuResult = C.CUDA_ERROR_OPERATING_SYSTEM // This indicates that an OS call failed. - InvalidHandle cuResult = C.CUDA_ERROR_INVALID_HANDLE // This indicates that a resource handle passed to the API call was not valid. Resource handles are opaque types like CUstream and CUevent. - NotFound cuResult = C.CUDA_ERROR_NOT_FOUND // This indicates that a named symbol was not found. Examples of symbols are global/constant variable names, texture names, and surface names. - NotReady cuResult = C.CUDA_ERROR_NOT_READY // This indicates that asynchronous operations issued previously have not completed yet. 
This result is not actually an error, but must be indicated differently than CUDA_SUCCESS (which indicates completion). Calls that may return this value include cuEventQuery() and cuStreamQuery(). - IllegalAddress cuResult = C.CUDA_ERROR_ILLEGAL_ADDRESS // While executing a kernel, the device encountered a load or store instruction on an invalid memory address. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. - LaunchOutOfResources cuResult = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES // This indicates that a launch did not occur because it did not have appropriate resources. This error usually indicates that the user has attempted to pass too many arguments to the device kernel, or the kernel launch specifies too many threads for the kernel's register count. Passing arguments of the wrong size (i.e. a 64-bit pointer when a 32-bit int is expected) is equivalent to passing too many arguments and can also result in this error. - LaunchTimeout cuResult = C.CUDA_ERROR_LAUNCH_TIMEOUT // This indicates that the device kernel took too long to execute. This can only occur if timeouts are enabled - see the device attribute CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. - LaunchIncompatibleTexturing cuResult = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING // This error indicates a kernel launch that uses an incompatible texturing mode. - PeerAccessAlreadyEnabled cuResult = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED // This error indicates that a call to cuCtxEnablePeerAccess() is trying to re-enable peer access to a context which has already had peer access to it enabled. - PeerAccessNotEnabled cuResult = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED // This error indicates that cuCtxDisablePeerAccess() is trying to disable peer access which has not been enabled yet via cuCtxEnablePeerAccess(). - PrimaryContextActive cuResult = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE // This error indicates that the primary context for the specified device has already been initialized. - ContextIsDestroyed cuResult = C.CUDA_ERROR_CONTEXT_IS_DESTROYED // This error indicates that the context current to the calling thread has been destroyed using cuCtxDestroy, or is a primary context which has not yet been initialized. - Assert cuResult = C.CUDA_ERROR_ASSERT // A device-side assert triggered during kernel execution. The context cannot be used anymore, and must be destroyed. All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. - TooManyPeers cuResult = C.CUDA_ERROR_TOO_MANY_PEERS // This error indicates that the hardware resources required to enable peer access have been exhausted for one or more of the devices passed to cuCtxEnablePeerAccess(). - HostMemoryAlreadyRegistered cuResult = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED // This error indicates that the memory range passed to cuMemHostRegister() has already been registered. - HostMemoryNotRegistered cuResult = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED // This error indicates that the pointer passed to cuMemHostUnregister() does not correspond to any currently registered memory region. 
- HardwareStackError cuResult = C.CUDA_ERROR_HARDWARE_STACK_ERROR // While executing a kernel, the device encountered a stack error. This can be due to stack corruption or exceeding the stack size limit. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. - IllegalInstruction cuResult = C.CUDA_ERROR_ILLEGAL_INSTRUCTION // While executing a kernel, the device encountered an illegal instruction. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. - MisalignedAddress cuResult = C.CUDA_ERROR_MISALIGNED_ADDRESS // While executing a kernel, the device encountered a load or store instruction on a memory address which is not aligned. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. - InvalidAddressSpace cuResult = C.CUDA_ERROR_INVALID_ADDRESS_SPACE // While executing a kernel, the device encountered an instruction which can only operate on memory locations in certain address spaces (global, shared, or local), but was supplied a memory address not belonging to an allowed address space. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. - InvalidPc cuResult = C.CUDA_ERROR_INVALID_PC // While executing a kernel, the device program counter wrapped its address space. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. - LaunchFailed cuResult = C.CUDA_ERROR_LAUNCH_FAILED // An exception occurred on the device while executing a kernel. Common causes include dereferencing an invalid device pointer and accessing out of bounds shared memory. This leaves the process in an inconsistent state and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. - NotPermitted cuResult = C.CUDA_ERROR_NOT_PERMITTED // This error indicates that the attempted operation is not permitted. - NotSupported cuResult = C.CUDA_ERROR_NOT_SUPPORTED // This error indicates that the attempted operation is not supported on the current system or device. - Unknown cuResult = C.CUDA_ERROR_UNKNOWN // This indicates that an unknown internal error has occurred. 
+ Success cuResult = C.CUDA_SUCCESS + InvalidValue cuResult = C.CUDA_ERROR_INVALID_VALUE + OutOfMemory cuResult = C.CUDA_ERROR_OUT_OF_MEMORY + NotInitialized cuResult = C.CUDA_ERROR_NOT_INITIALIZED + Deinitialized cuResult = C.CUDA_ERROR_DEINITIALIZED + ProfilerDisabled cuResult = C.CUDA_ERROR_PROFILER_DISABLED + ProfilerNotInitialized cuResult = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED + ProfilerAlreadyStarted cuResult = C.CUDA_ERROR_PROFILER_ALREADY_STARTED + ProfilerAlreadyStopped cuResult = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED + NoDevice cuResult = C.CUDA_ERROR_NO_DEVICE + InvalidDevice cuResult = C.CUDA_ERROR_INVALID_DEVICE + InvalidImage cuResult = C.CUDA_ERROR_INVALID_IMAGE + InvalidContext cuResult = C.CUDA_ERROR_INVALID_CONTEXT + ContextAlreadyCurrent cuResult = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT + MapFailed cuResult = C.CUDA_ERROR_MAP_FAILED + UnmapFailed cuResult = C.CUDA_ERROR_UNMAP_FAILED + ArrayIsMapped cuResult = C.CUDA_ERROR_ARRAY_IS_MAPPED + AlreadyMapped cuResult = C.CUDA_ERROR_ALREADY_MAPPED + NoBinaryForGpu cuResult = C.CUDA_ERROR_NO_BINARY_FOR_GPU + AlreadyAcquired cuResult = C.CUDA_ERROR_ALREADY_ACQUIRED + NotMapped cuResult = C.CUDA_ERROR_NOT_MAPPED + NotMappedAsArray cuResult = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY + NotMappedAsPointer cuResult = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER + EccUncorrectable cuResult = C.CUDA_ERROR_ECC_UNCORRECTABLE + UnsupportedLimit cuResult = C.CUDA_ERROR_UNSUPPORTED_LIMIT + ContextAlreadyInUse cuResult = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE + PeerAccessUnsupported cuResult = C.CUDA_ERROR_PEER_ACCESS_UNSUPPORTED + InvalidPtx cuResult = C.CUDA_ERROR_INVALID_PTX + InvalidGraphicsContext cuResult = C.CUDA_ERROR_INVALID_GRAPHICS_CONTEXT + NvlinkUncorrectable cuResult = C.CUDA_ERROR_NVLINK_UNCORRECTABLE + JitCompilerNotFound cuResult = C.CUDA_ERROR_JIT_COMPILER_NOT_FOUND + InvalidSource cuResult = C.CUDA_ERROR_INVALID_SOURCE + FileNotFound cuResult = C.CUDA_ERROR_FILE_NOT_FOUND + SharedObjectSymbolNotFound cuResult = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND + SharedObjectInitFailed cuResult = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + OperatingSystem cuResult = C.CUDA_ERROR_OPERATING_SYSTEM + InvalidHandle cuResult = C.CUDA_ERROR_INVALID_HANDLE + IllegalState cuResult = C.CUDA_ERROR_ILLEGAL_STATE + NotFound cuResult = C.CUDA_ERROR_NOT_FOUND + NotReady cuResult = C.CUDA_ERROR_NOT_READY + IllegalAddress cuResult = C.CUDA_ERROR_ILLEGAL_ADDRESS + LaunchOutOfResources cuResult = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES + LaunchTimeout cuResult = C.CUDA_ERROR_LAUNCH_TIMEOUT + LaunchIncompatibleTexturing cuResult = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING + PeerAccessAlreadyEnabled cuResult = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED + PeerAccessNotEnabled cuResult = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED + PrimaryContextActive cuResult = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE + ContextIsDestroyed cuResult = C.CUDA_ERROR_CONTEXT_IS_DESTROYED + Assert cuResult = C.CUDA_ERROR_ASSERT + TooManyPeers cuResult = C.CUDA_ERROR_TOO_MANY_PEERS + HostMemoryAlreadyRegistered cuResult = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED + HostMemoryNotRegistered cuResult = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED + HardwareStackError cuResult = C.CUDA_ERROR_HARDWARE_STACK_ERROR + IllegalInstruction cuResult = C.CUDA_ERROR_ILLEGAL_INSTRUCTION + MisalignedAddress cuResult = C.CUDA_ERROR_MISALIGNED_ADDRESS + InvalidAddressSpace cuResult = C.CUDA_ERROR_INVALID_ADDRESS_SPACE + InvalidPc cuResult = C.CUDA_ERROR_INVALID_PC + LaunchFailed cuResult = C.CUDA_ERROR_LAUNCH_FAILED + 
CooperativeLaunchTooLarge cuResult = C.CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE + NotPermitted cuResult = C.CUDA_ERROR_NOT_PERMITTED + NotSupported cuResult = C.CUDA_ERROR_NOT_SUPPORTED + SystemNotReady cuResult = C.CUDA_ERROR_SYSTEM_NOT_READY + SystemDriverMismatch cuResult = C.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH + CompatNotSupportedOnDevice cuResult = C.CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE + StreamCaptureUnsupported cuResult = C.CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED + StreamCaptureInvalidated cuResult = C.CUDA_ERROR_STREAM_CAPTURE_INVALIDATED + StreamCaptureMerge cuResult = C.CUDA_ERROR_STREAM_CAPTURE_MERGE + StreamCaptureUnmatched cuResult = C.CUDA_ERROR_STREAM_CAPTURE_UNMATCHED + StreamCaptureUnjoined cuResult = C.CUDA_ERROR_STREAM_CAPTURE_UNJOINED + StreamCaptureIsolation cuResult = C.CUDA_ERROR_STREAM_CAPTURE_ISOLATION + StreamCaptureImplicit cuResult = C.CUDA_ERROR_STREAM_CAPTURE_IMPLICIT + CapturedEvent cuResult = C.CUDA_ERROR_CAPTURED_EVENT + StreamCaptureWrongThread cuResult = C.CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD + Timeout cuResult = C.CUDA_ERROR_TIMEOUT + GraphExecUpdateFailure cuResult = C.CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE + Unknown cuResult = C.CUDA_ERROR_UNKNOWN ) var resString = map[cuResult]string{ @@ -117,12 +136,14 @@ var resString = map[cuResult]string{ InvalidPtx: "InvalidPtx", InvalidGraphicsContext: "InvalidGraphicsContext", NvlinkUncorrectable: "NvlinkUncorrectable", + JitCompilerNotFound: "JitCompilerNotFound", InvalidSource: "InvalidSource", FileNotFound: "FileNotFound", SharedObjectSymbolNotFound: "SharedObjectSymbolNotFound", SharedObjectInitFailed: "SharedObjectInitFailed", OperatingSystem: "OperatingSystem", InvalidHandle: "InvalidHandle", + IllegalState: "IllegalState", NotFound: "NotFound", NotReady: "NotReady", IllegalAddress: "IllegalAddress", @@ -143,7 +164,22 @@ var resString = map[cuResult]string{ InvalidAddressSpace: "InvalidAddressSpace", InvalidPc: "InvalidPc", LaunchFailed: "LaunchFailed", + CooperativeLaunchTooLarge: "CooperativeLaunchTooLarge", NotPermitted: "NotPermitted", NotSupported: "NotSupported", + SystemNotReady: "SystemNotReady", + SystemDriverMismatch: "SystemDriverMismatch", + CompatNotSupportedOnDevice: "CompatNotSupportedOnDevice", + StreamCaptureUnsupported: "StreamCaptureUnsupported", + StreamCaptureInvalidated: "StreamCaptureInvalidated", + StreamCaptureMerge: "StreamCaptureMerge", + StreamCaptureUnmatched: "StreamCaptureUnmatched", + StreamCaptureUnjoined: "StreamCaptureUnjoined", + StreamCaptureIsolation: "StreamCaptureIsolation", + StreamCaptureImplicit: "StreamCaptureImplicit", + CapturedEvent: "CapturedEvent", + StreamCaptureWrongThread: "StreamCaptureWrongThread", + Timeout: "Timeout", + GraphExecUpdateFailure: "GraphExecUpdateFailure", Unknown: "Unknown", } diff --git a/test_test.go b/test_test.go index ecf98af..9eb13e4 100644 --- a/test_test.go +++ b/test_test.go @@ -10,8 +10,8 @@ const add32PTX = `// // Based on LLVM 3.4svn // -.version 5.0 -.target sm_20 +.version 6.5 +.target sm_75 .address_size 64 // .globl add32 @@ -49,7 +49,7 @@ const add32PTX = `// mad.lo.s32 %r18, %r14, %r11, %r17; mad.lo.s32 %r1, %r12, %r16, %r18; setp.ge.s32 %p1, %r1, %r2; - @%p1 bra BB0_2; + @%p1 bra BB1_2; cvta.to.global.u64 %rd3, %rd1; mul.wide.s32 %rd4, %r1, 4; @@ -58,10 +58,10 @@ const add32PTX = `// add.s64 %rd7, %rd6, %rd4; ld.global.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd5]; - add.f32 %f3, %f2, %f1; + add.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd5], %f3; -BB0_2: +BB1_2: ret; }
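Similarly, a hedged sketch of the host-node plumbing from hostfunction.go and params.go: a HostFunction is an ordinary Go closure, and HostNodeParams carries it into cuGraphAddHostNode (registration happens lazily via RegisterFunc inside params.c(); RegisterFunc is referenced by this diff but defined elsewhere in the package). Instantiating and launching graphs is outside this diff, so the sketch stops at node creation.

package main

import (
	"log"

	cu "gorgonia.org/cu"
)

func main() {
	g, err := cu.MakeGraph()
	if err != nil {
		log.Fatal(err)
	}
	defer g.Destroy()

	// The closure runs on a CPU thread when graph execution reaches the node.
	fn := cu.HostFunction(func() { log.Println("host node reached") })

	// nil children: the host node has no dependencies within the graph.
	if _, err := g.AddHostNode(nil, &cu.HostNodeParams{Func: fn}); err != nil {
		log.Fatal(err)
	}
}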