CUDA11

* CUDA11 initial work. First, we generate the new enums * Added generateEnums, which generates the Go version of the CUresult type * Updated tests such that they no longer fail. Added a Signal() method to BatchedContext, to force the BatchedContext to DoWork * Updated benchmarking of batched vs no batched context. It would appear that for now Batching no longer confers a benefit * Attempt #4 at getting CUDA11. Previous attempts were working based off a faulty copy of `cuda.h` - Updated Device to support UUID - Updated README - Updated genlib to do more things more carefully * More work on CUDA11 - Added more mappings into mappings.go to generate stufff - Changed the definition of Context, by adding one additional method to clear L2Cache - Added stubs for LaunchCooperativeKernel - Added Graph types. TODO next: add all the basic Graph data structure and then autogenerate all the things! * Fixed mappings to also include @egonelbre's change in 2e25e65 Fixed a bug where Fix() wasn't called, leading to weird generations * Added some graph stuff, fixed some mappings stuff for genAPI. It seems that the graph functions will have to be manually written for now * Updated graph.go from ages ago * Updated more of CUDA11 Graph API into the library. Slowly getting there. * Added the body of CopyParams * Added AddMemsetNode method for Graph. * Fixed a bunch of things * Switched to modernc.org/cc instead of using the older github.com/cznic/cc * cuDNN updated their website. So parse.py also has to change. As a result moredecls.go also changed * Sorted the data in mappings.go. This will allow for better diffing * Updated the generatethis pipeline * Initial mappings generation. * Mapped the old commented out mappings to new commented out mappings (see mappings.ods) * Generated enums. * Updated enums and enum strings * Added more generated data structures * Added methods * Generated stubs. 7 TODOs * Added more incompletes report * Manually fixed the TODO of SpatialTransformer * Manually fixed generated_rnndata.go * Manually fixed generated_seqdata.go * Manually fixed generated_backend.go * Manually fixed generated_tensortransform.go * Fixed the missing getters * fixed all the .C()s of the generated types * Generated a new API * Fixed random C int issues. Now to handle the rest * Updated INCOMPLETES_REPORTS * fixed variable collition in _BackendAttributeTypeNames * gencudnn enum generation syntax fixes added * Updated INCOMPLETES * variable renaming added as per the review * AlgorithmDescriptor syntax fixes added * AlgorithmPerformance syntax fixes added * Activation cudnnActivationDescriptor_t return method name change added * syntax fixes added on FusedOpVariantParams * FusedOpConsts syntax fixes added * C type retrieve function added for cudnnStatus * tensor file syntax fixes added tensor file unreachable code removed * method receiver renaming added * optensor syntax fixes added * generated_api syntax fixes added * code review changes added * go modules updated algorithmdescriptor Algorithm type changes added * review changes added GetRNNLinLayerBiasParams & GetRNNLinLayerMatrixParams methods moved to manually written API.go file * Fixed a bug in parse.py where when parsing the documentation for CUDA11, the function names have `()` * Removed deprecated functions from being generated * More deprecated stuff no longer generated * Fixed up algorithmdescriptor.go * fixed some auto generated issues * Manually fixed the fused ops generation * Fixed even more autogenerated errors * Fixed up more of the auto generated issues * Renamed API to todo, because eh, I'll figure it out later Co-authored-by: Aruna Prabhashwara <[email protected]>
gorgonia · Aug 2, 2021 · a41082c · a41082c
1 parent 5b83640
commit a41082c
Show file tree

Hide file tree

Showing 74 changed files with 5,742 additions and 2,345 deletions.
diff --git a/batch.go b/batch.go
@@ -197,6 +197,9 @@ func (ctx *BatchedContext) enqueue(c call) (retVal DevicePtr, err error) {
 // WorkAvailable returns the chan where work availability is broadcasted on.
 func (ctx *BatchedContext) WorkAvailable() <-chan struct{} { return ctx.workAvailable }
 
+// Signal is used to tell the context that work is available
+func (ctx *BatchedContext) Signal() { ctx.workAvailable <- struct{}{} }
+
 // DoWork waits for work to come in from the queue. If it's blocking, the entire queue will be processed immediately.
 // Otherwise it will be added to the batch queue.
 func (ctx *BatchedContext) DoWork() {

diff --git a/batch_test.go b/batch_test.go
@@ -3,8 +3,11 @@ package cu
 import (
  "log"
  "runtime"
+ "sync/atomic"
  "testing"
  "unsafe"
+
+ _ "net/http/pprof"
 )
 
 func TestBatchContext(t *testing.T) {
@@ -123,8 +126,8 @@ func TestLargeBatch(t *testing.T) {
 
  dev.TotalMem()
 
- beforeFree, _, _ := MemInfo()
  ctx := newContext(cuctx)
+ beforeFree, _, _ := MemInfo()
  bctx := NewBatchedContext(ctx, dev)
 
  runtime.LockOSThread()
@@ -140,7 +143,8 @@ func TestLargeBatch(t *testing.T) {
  }
  size := int64(len(a) * 4)
 
- go func() {
+ var freeCount uint32
+ go func(fc *uint32) {
  var memA, memB DevicePtr
  var frees []DevicePtr
 
@@ -175,13 +179,13 @@ func TestLargeBatch(t *testing.T) {
 
  bctx.MemcpyDtoH(unsafe.Pointer(&a[0]), memA, size)
  bctx.MemcpyDtoH(unsafe.Pointer(&b[0]), memB, size)
- log.Printf("Number of frees %v", len(frees))
  for _, free := range frees {
  bctx.MemFree(free)
  }
+ atomic.AddUint32(fc, uint32(len(frees)))
  bctx.workAvailable <- struct{}{}
  doneChan <- struct{}{}
- }()
+ }(&freeCount)
 
 loop:
  for {
@@ -205,14 +209,18 @@ loop:
  break
  }
  }
-
+ mod.Unload()
  afterFree, _, _ := MemInfo()
+ cuctx.Destroy()
+ runtime.GC()
 
+ if freeCount != 16114 {
+ t.Errorf("Expected 16114 frees. Got %d instead", freeCount)
+ }
  if afterFree != beforeFree {
- t.Errorf("Before: Freemem: %v. After %v | Diff %v", beforeFree, afterFree, (beforeFree-afterFree)/1024)
+ t.Logf("Before: Freemem: %v. After %v | Diff %v", beforeFree, afterFree, (beforeFree-afterFree)/1024)
  }
- mod.Unload()
- cuctx.Destroy()
+
 }
 
 func BenchmarkNoBatching(bench *testing.B) {
@@ -285,6 +293,10 @@ func BenchmarkNoBatching(bench *testing.B) {
  bench.Fatalf("Failed to copy memory to b: %v", err)
  }
  }
+ // useful for checking results
+ // if i == 0 {
+ // bench.Logf("%v", a[:10])
+ // }
  }
  MemFree(memA)
  MemFree(memB)
@@ -347,20 +359,39 @@ func BenchmarkBatching(bench *testing.B) {
  workAvailable := bctx.WorkAvailable()
  for i := 0; i < bench.N; i++ {
  for j := 0; j < 100; j++ {
- select {
- case <-workAvailable:
- bctx.DoWork()
- default:
+ done := make(chan struct{}, 1)
+ go func(done chan struct{}) {
  bctx.MemcpyHtoD(memA, unsafe.Pointer(&a[0]), size)
  bctx.MemcpyHtoD(memB, unsafe.Pointer(&b[0]), size)
  bctx.LaunchKernel(fn, 100, 10, 1, 1000, 1, 1, 0, Stream{}, args)
  bctx.Synchronize()
  bctx.MemcpyDtoH(unsafe.Pointer(&a[0]), memA, size)
  bctx.MemcpyDtoH(unsafe.Pointer(&b[0]), memB, size)
+ bctx.Signal()
+ done <- struct{}{}
+ }(done)
+
+ work:
+ for {
+ select {
+ case <-workAvailable:
+ bctx.DoWork()
+ case <-done:
+ break work
+ }
  }
+
+ }
+
+ if err := bctx.Errors(); err != nil {
+ bench.Fatalf("Failed with errors in benchmark %d. Error: %v", i, err)
  }
- }
 
+ // useful for checking results
+ // if i == 0 {
+ // bench.Logf("%v", a[:10])
+ // }
+ }
  MemFree(memA)
  MemFree(memB)
  mod.Unload()

diff --git a/blas/cgoflags.go b/blas/cgoflags.go
@@ -2,5 +2,5 @@ package cublas
 
 // #cgo CFLAGS: -I/usr/local/cuda-9.0/targets/x86_64-linux/include -I/usr/local/cuda/include
 // #cgo LDFLAGS: -lcublas
-// #cgo LDFLAGS: -L/usr/local/cuda-9.0/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 
+// #cgo LDFLAGS: -L/usr/local/cuda-9.0/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/lib/x86_64-linux-gnu
 import "C"
diff --git a/blas/example_test.go b/blas/example_test.go
@@ -8,7 +8,7 @@ import (
  "github.com/pkg/errors"
  "gonum.org/v1/gonum/blas"
  "gorgonia.org/cu"
- "gorgonia.org/cu/blas"
+ cublas "gorgonia.org/cu/blas"
  "gorgonia.org/tensor"
 )
 
@@ -77,9 +77,8 @@ func (e *Engine) ContextErr() error { return e.ctx.Error() }
 
 type foomem []float64
 
-func (m foomem) Uintptr() uintptr { return uintptr(unsafe.Pointer(&m[0])) }
-func (m foomem) Pointer() unsafe.Pointer { return unsafe.Pointer(&m[0]) }
-func (m foomem) MemSize() uintptr { return uintptr(len(m) * 8) }
+func (m foomem) Uintptr() uintptr { return uintptr(unsafe.Pointer(&m[0])) }
+func (m foomem) MemSize() uintptr { return uintptr(len(m) * 8) }
 
 func (e *Engine) checkThreeFloat(a, b, ret tensor.Tensor) (ad, bd, retVal *tensor.Dense, err error) {
  if /*a.IsNativelyAccessible() &&*/ !a.IsManuallyManaged() {

diff --git a/cgoflags.go b/cgoflags.go
@@ -5,21 +5,14 @@ package cu
 //#cgo LDFLAGS:-lcuda
 //
 ////default location:
-//#cgo linux,windows LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib
-//#cgo linux,windows CFLAGS: -I/usr/local/cuda/include/
+//#cgo linux LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib
+//#cgo linux CFLAGS: -I/usr/local/cuda/include
 //
 ////default location if not properly symlinked:
+//#cgo linux LDFLAGS:-L/usr/local/cuda-11.0/targets/x86_64-linux/lib
 //#cgo linux LDFLAGS:-L/usr/local/cuda-10.2/lib64 -L/usr/local/cuda-10.2/lib
-//#cgo linux LDFLAGS:-L/usr/local/cuda-10.1/lib64 -L/usr/local/cuda-10.1/lib
-//#cgo linux LDFLAGS:-L/usr/local/cuda-6.0/lib64 -L/usr/local/cuda-6.0/lib
-//#cgo linux LDFLAGS:-L/usr/local/cuda-5.5/lib64 -L/usr/local/cuda-5.5/lib
-//#cgo linux LDFLAGS:-L/usr/local/cuda-5.0/lib64 -L/usr/local/cuda-5.0/lib
+//#cgo linux CFLAGS: -I/usr/local/cuda-11.0/targets/x86_64-linux/include
 //#cgo linux CFLAGS: -I/usr/local/cuda-10.2/include/
-//#cgo linux CFLAGS: -I/usr/local/cuda-10.1/include/
-//#cgo linux CFLAGS: -I/usr/local/cuda-6.0/include/
-//#cgo linux CFLAGS: -I/usr/local/cuda-5.5/include/
-//#cgo linux CFLAGS: -I/usr/local/cuda-5.0/include/
-//
 ////Ubuntu 15.04:
 //#cgo linux LDFLAGS:-L/usr/lib/x86_64-linux-gnu/
 //#cgo linux CFLAGS: -I/usr/include

diff --git a/cmd/cublas_integration_example/engine.go b/cmd/cublas_integration_example/engine.go
@@ -5,7 +5,7 @@ import (
  "unsafe"
 
  "gorgonia.org/cu"
- "gorgonia.org/cu/blas"
+ cublas "gorgonia.org/cu/blas"
  "gorgonia.org/tensor"
 )
 
@@ -74,6 +74,5 @@ func (e *Engine) ContextErr() error { return e.ctx.Error() }
 
 type foomem []float64
 
-func (m foomem) Uintptr() uintptr { return uintptr(unsafe.Pointer(&m[0])) }
-func (m foomem) Pointer() unsafe.Pointer { return unsafe.Pointer(&m[0]) }
-func (m foomem) MemSize() uintptr { return uintptr(len(m) * 8) }
+func (m foomem) Uintptr() uintptr { return uintptr(unsafe.Pointer(&m[0])) }
+func (m foomem) MemSize() uintptr { return uintptr(len(m) * 8) }
diff --git a/cmd/gencublas/binding.go b/cmd/gencublas/binding.go
@@ -12,9 +12,9 @@ import (
  "strings"
  "text/template"
 
- "github.com/cznic/cc"
- "github.com/cznic/xc"
  bg "github.com/gorgonia/bindgen"
+ "modernc.org/cc"
+ "modernc.org/xc"
 )
 
 var goTypes = map[bg.TypeKey]bg.Template{

diff --git a/cmd/gencublas/main.go b/cmd/gencublas/main.go
@@ -17,8 +17,8 @@ import (
  "path"
  "strings"
 
- "github.com/cznic/cc"
  bg "github.com/gorgonia/bindgen"
+ "modernc.org/cc"
 )
 
 var (

diff --git a/cmd/gencudnn/README.md b/cmd/gencudnn/README.md
@@ -34,4 +34,17 @@ cudnnStatus_t cudnnGetCallback(
  unsigned *mask,
  void **udata,
  cudnnCallback_t *fptr);
- ```
+ ```
+
+
+# TODOs
+
+## Stubs ##
+
+* /home/chewxy/workspace/gorgoniaws/src/gorgonia.org/cu/dnn/generated_ctcloss.go. TODO: true
+* ~~/home/chewxy/workspace/gorgoniaws/src/gorgonia.org/cu/dnn/generated_spatialtransformer.go. TODO: true~~
+* ~~/home/chewxy/workspace/gorgoniaws/src/gorgonia.org/cu/dnn/generated_seqdata.go. TODO: true~~
+* ~~/home/chewxy/workspace/gorgoniaws/src/gorgonia.org/cu/dnn/generated_backend.go. TODO: true~~
+* ~~/home/chewxy/workspace/gorgoniaws/src/gorgonia.org/cu/dnn/generated_rnndata.go. TODO: true~~
+* ~~/home/chewxy/workspace/gorgoniaws/src/gorgonia.org/cu/dnn/generated_tensortransform.go. TODO: true~~
+* /home/chewxy/workspace/gorgoniaws/src/gorgonia.org/cu/dnn/generated_algorithmdescriptor.go. TODO: true
diff --git a/cmd/gencudnn/conversion.go b/cmd/gencudnn/conversion.go
@@ -102,6 +102,7 @@ func csig2gosig(cs *bg.CSignature, retVal *GoSignature) (*GoSignature, error) {
 
  ioParamList := ioParams[cs.Name]
  for i, p := range params {
+
  _, isRetVal := retValPos[i]
  name := p.Name()
  typeName := goNameOf(p.Type())

diff --git a/cmd/gencudnn/declarations.go b/cmd/gencudnn/declarations.go
@@ -1,5 +1,7 @@
 package main
 
+import "strings"
+
 var empty struct{}
 
 var ignoredEnums = map[string]struct{}{
@@ -53,6 +55,17 @@ var ctypes2GoTypes = map[string]string{
  "cudnnDropoutDescriptor_t": "Dropout",
  "cudnnRNNDescriptor_t": "RNN",
  "cudnnPersistentRNNPlan_t": "PersistentRNNPlan",
+
+ // cuda11
+ "cudnnFusedOpsVariantParamPack_t": "FusedOpVariantParams",
+ "cudnnFusedOpsConstParamPack_t": "FusedOpConsts",
+ "cudnnSeqDataDescriptor_t": "SeqData",
+ "cudnnTensorTransformDescriptor_t": "TensorTransform",
+ "cudnnAlgorithmDescriptor_t": "AlgorithmDescriptor",
+ "cudnnAlgorithmPerformance_t": "AlgorithmPerformance",
+ "cudnnBackendDescriptor_t": "Backend",
+ "cudnnRNNDataDescriptor_t": "RNNData",
+ "cudnnAttnDescriptor_t": "Attention",
 }
 
 var alphaBetaParams = []string{
@@ -69,6 +82,8 @@ var builtins = map[string]string{
  "unsigned long long": "uint64",
 
  "size_t": "uintptr",
+
+ "int64_t": "int64",
 }
 
 var go2cBuiltins = map[string]string{
@@ -80,6 +95,8 @@ var go2cBuiltins = map[string]string{
  "uint64": "ulonglong",
 
  "uintptr": "size_t",
+
+ "int64": "int64_t",
 }
 
 var nonPrimitives = map[string]string{
@@ -101,3 +118,13 @@ var fnParamTypes = map[string]map[string]string{
  "cudnnFindConvolutionBackwardDataAlgorithm": {"returnedAlgoCount": "int"},
  "cudnnFindConvolutionBackwardDataAlgorithmEx": {"returnedAlgoCount": "int"},
 }
+
+var deprecated = make(map[string]struct{})
+
+func init() {
+ for n, doc := range docs {
+ if strings.Contains(doc, "has been deprecated in cuDNN 8.0.") {
+ deprecated[n] = struct{}{}
+ }
+ }
+}
diff --git a/cmd/gencudnn/generatethis.go b/cmd/gencudnn/generatethis.go
@@ -6,15 +6,15 @@ import (
  "os"
  "strings"
 
- "github.com/cznic/cc"
  "github.com/gorgonia/bindgen"
  "github.com/kr/pretty"
+ "modernc.org/cc"
 )
 
 // generate this contains function to generate for THIS package (main)
 
 // generateMappings is used to generate the mappings
-func generateMappings(appendCurrent bool) {
+func generateMappings(appendCurrent bool, fns ...func(buf io.WriteCloser, t *cc.TranslationUnit)) {
  hdr := "package main\n"
 
  initfn := `
@@ -38,17 +38,11 @@ func generateMappings(appendCurrent bool) {
  fmt.Fprintln(buf, hdr)
  bindgen.GenIgnored(buf, t, functions)
  fmt.Fprintln(buf, initfn)
- bindgen.GenNameMap(buf, t, "fnNameMap", processNameBasic, functions, true)
- bindgen.GenNameMap(buf, t, "enumMappings", processNameBasic, enums, true)
-
- generateCRUD(buf, t, "create")
- generateCRUD(buf, t, "set")
- generateCRUD(buf, t, "destroy")
- generateCRUD(buf, t, "methods")
- fmt.Fprintln(buf, "}\n")
  }
- generateAlphaBeta(buf, t)
- fmt.Fprintln(buf, initfn)
+
+ for _, fn := range fns {
+ fn(buf, t)
+ }
  fmt.Fprintln(buf, "}\n")
 }