diff --git a/NDTensors/Project.toml b/NDTensors/Project.toml index 23f556c2a1..d7bd8ca521 100644 --- a/NDTensors/Project.toml +++ b/NDTensors/Project.toml @@ -14,7 +14,6 @@ EllipsisNotation = "da5c29d0-fa7d-589e-88eb-ea29b0a81949" FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" Folds = "41a02a25-b8f0-4f67-bc48-60067656b558" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" -GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" HalfIntegers = "f0d1745a-41c9-11e9-1dd9-e5d34d218721" InlineStrings = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -35,6 +34,7 @@ VectorInterface = "409d34a3-91d5-4945-b6ec-7529ddf182d8" [weakdeps] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" Metal = "dde4c033-4e86-420c-a63e-0dd931031962" Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4" @@ -42,10 +42,11 @@ TBLIS = "48530278-0828-4a49-9772-0f3830dfa1e9" cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1" [extensions] -NDTensorsAMDGPUExt = "AMDGPU" -NDTensorsCUDAExt = "CUDA" +NDTensorsAMDGPUExt = ["AMDGPU","GPUArraysCore"] +NDTensorsCUDAExt = ["CUDA","GPUArraysCore"] +NDTensorsGPUArraysCoreExt = "GPUArraysCore" NDTensorsHDF5Ext = "HDF5" -NDTensorsMetalExt = "Metal" +NDTensorsMetalExt = ["GPUArraysCore", "Metal"] NDTensorsOctavianExt = "Octavian" NDTensorsTBLISExt = "TBLIS" NDTensorscuTENSORExt = "cuTENSOR" @@ -90,6 +91,7 @@ julia = "1.6" [extras] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" Metal = "dde4c033-4e86-420c-a63e-0dd931031962" Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4" diff --git a/NDTensors/ext/NDTensorsAMDGPUExt/append.jl b/NDTensors/ext/NDTensorsAMDGPUExt/append.jl index e84d8b064d..c4b5d30947 100644 --- a/NDTensors/ext/NDTensorsAMDGPUExt/append.jl +++ b/NDTensors/ext/NDTensorsAMDGPUExt/append.jl @@ -1,5 +1,5 @@ -using GPUArraysCore: @allowscalar using AMDGPU: ROCArray +using GPUArraysCore: @allowscalar using NDTensors.Expose: Exposed, unexpose ## Warning this append function uses scalar indexing and is therefore extremely slow diff --git a/NDTensors/ext/NDTensorsAMDGPUExt/indexing.jl b/NDTensors/ext/NDTensorsAMDGPUExt/indexing.jl index c0b9fc4afd..46ade03433 100644 --- a/NDTensors/ext/NDTensorsAMDGPUExt/indexing.jl +++ b/NDTensors/ext/NDTensorsAMDGPUExt/indexing.jl @@ -1,7 +1,7 @@ -using NDTensors.Expose: Exposed, expose, parent, unexpose -using NDTensors.GPUArraysCoreExtensions: cpu using AMDGPU: AMDGPU, ROCArray using GPUArraysCore: @allowscalar +using NDTensors.Expose: Exposed, expose, parent, unexpose +using NDTensors.GPUArraysCoreExtensions: cpu function Base.getindex(E::Exposed{<:ROCArray}) return @allowscalar unexpose(E)[] diff --git a/NDTensors/ext/NDTensorsCUDAExt/append.jl b/NDTensors/ext/NDTensorsCUDAExt/append.jl index 48470e5131..9a974354ab 100644 --- a/NDTensors/ext/NDTensorsCUDAExt/append.jl +++ b/NDTensors/ext/NDTensorsCUDAExt/append.jl @@ -1,5 +1,5 @@ -using GPUArraysCore: @allowscalar using CUDA: CuArray +using GPUArraysCore: @allowscalar using NDTensors.Expose: Exposed, unexpose ## Warning this append function uses scalar indexing and is therefore extremely slow diff --git a/NDTensors/ext/NDTensorsGPUArraysCoreExt/NDTensorsGPUArraysCoreExt.jl b/NDTensors/ext/NDTensorsGPUArraysCoreExt/NDTensorsGPUArraysCoreExt.jl new file mode 100644 index 0000000000..1ead107b45 --- /dev/null +++ b/NDTensors/ext/NDTensorsGPUArraysCoreExt/NDTensorsGPUArraysCoreExt.jl @@ -0,0 +1,3 @@ +module NDTensorsGPUArraysCoreExt +include("contract.jl") +end diff --git a/NDTensors/ext/NDTensorsGPUArraysCoreExt/contract.jl b/NDTensors/ext/NDTensorsGPUArraysCoreExt/contract.jl new file mode 100644 index 0000000000..8a6a18283a --- /dev/null +++ b/NDTensors/ext/NDTensorsGPUArraysCoreExt/contract.jl @@ -0,0 +1,48 @@ +using Adapt: adapt +using GPUArraysCore: AbstractGPUArray +using NDTensors: NDTensors, DenseTensor, DiagTensor, contract!, dense, inds, Tensor +using NDTensors.Expose: Exposed, expose, unexpose +using NDTensors.TypeParameterAccessors: parenttype, set_ndims + +## In this function we convert the DiagTensor to a dense tensor and +## Feed it back into contract +function NDTensors.contract!( + output_tensor::Exposed{<:AbstractGPUArray,<:DenseTensor}, + labelsoutput_tensor, + tensor1::Exposed{<:Number,<:DiagTensor}, + labelstensor1, + tensor2::Exposed{<:AbstractGPUArray,<:DenseTensor}, + labelstensor2, + α::Number=one(Bool), + β::Number=zero(Bool), +) + tensor1 = unexpose(tensor1) + ## convert tensor1 to a dense + ## TODO this allocates on CPU first then moves over to GPU which could be slow + tensor1 = adapt(set_ndims(parenttype(typeof(tensor2)), 1), dense(tensor1)) + return contract!( + output_tensor, + labelsoutput_tensor, + expose(tensor1), + labelstensor1, + tensor2, + labelstensor2, + α, + β, + ) +end + +function NDTensors.contract!( + output_tensor::Exposed{<:AbstractGPUArray,<:DenseTensor}, + labelsoutput_tensor, + tensor1::Exposed{<:AbstractGPUArray,<:DenseTensor}, + labelstensor1, + tensor2::Exposed{<:Number,<:DiagTensor}, + labelstensor2, + α::Number=one(Bool), + β::Number=zero(Bool), +) + return contract!( + output_tensor, labelsoutput_tensor, tensor2, labelstensor2, tensor1, labelstensor1, α, β + ) +end diff --git a/NDTensors/ext/NDTensorsMetalExt/permutedims.jl b/NDTensors/ext/NDTensorsMetalExt/permutedims.jl index 0480103564..5af55b8eb3 100644 --- a/NDTensors/ext/NDTensorsMetalExt/permutedims.jl +++ b/NDTensors/ext/NDTensorsMetalExt/permutedims.jl @@ -1,4 +1,5 @@ using Metal: MtlArray +using GPUArraysCore: @allowscalar using NDTensors.Expose: Exposed, expose, unexpose ## Theres an issue in metal that `ReshapedArray' wrapped arrays cannot be permuted using ## permutedims (failing in that Metal uses scalar indexing) diff --git a/NDTensors/src/blocksparse/diagblocksparse.jl b/NDTensors/src/blocksparse/diagblocksparse.jl index a68b7a769c..a8b46b17cb 100644 --- a/NDTensors/src/blocksparse/diagblocksparse.jl +++ b/NDTensors/src/blocksparse/diagblocksparse.jl @@ -666,7 +666,9 @@ function contract!( # Overwrite the block of R β = zero(ElR) end - contract!(Rblock, labelsR, T1block, labelsT1, T2block, labelsT2, α, β) + contract!( + expose(Rblock), labelsR, expose(T1block), labelsT1, expose(T2block), labelsT2, α, β + ) end return R end diff --git a/NDTensors/src/imports.jl b/NDTensors/src/imports.jl index d3e7bd506d..04ea3d8d6b 100644 --- a/NDTensors/src/imports.jl +++ b/NDTensors/src/imports.jl @@ -9,7 +9,6 @@ using Base.Threads using Compat using Dictionaries using Folds -using GPUArraysCore using InlineStrings using Random using LinearAlgebra diff --git a/NDTensors/src/lib/GPUArraysCoreExtensions/src/gpuarrayscore.jl b/NDTensors/src/lib/GPUArraysCoreExtensions/src/gpuarrayscore.jl index 5c7c858de1..20ccf11e1d 100644 --- a/NDTensors/src/lib/GPUArraysCoreExtensions/src/gpuarrayscore.jl +++ b/NDTensors/src/lib/GPUArraysCoreExtensions/src/gpuarrayscore.jl @@ -1,4 +1,3 @@ -using GPUArraysCore: AbstractGPUArray, @allowscalar using NDTensors.Expose: Exposed, unexpose using NDTensors.TypeParameterAccessors: TypeParameterAccessors, type_parameter, set_type_parameter diff --git a/NDTensors/test/test_diag.jl b/NDTensors/test/test_diag.jl index bb1890ab9b..f6390a733f 100644 --- a/NDTensors/test/test_diag.jl +++ b/NDTensors/test/test_diag.jl @@ -2,8 +2,10 @@ using NDTensors using Test: @testset, @test, @test_throws using GPUArraysCore: @allowscalar +using Adapt: adapt include("NDTensorsTestUtils/NDTensorsTestUtils.jl") using .NDTensorsTestUtils: devices_list, is_supported_eltype +using LinearAlgebra: dot @testset "DiagTensor basic functionality" begin @testset "test device: $dev" for dev in devices_list(copy(ARGS)), @@ -56,13 +58,19 @@ using .NDTensorsTestUtils: devices_list, is_supported_eltype @test @allowscalar S1 ≈ S2 end end -@testset "DiagTensor contractions" begin - t = tensor(Diag([1.0, 1.0, 1.0]), (3, 3)) - A = randomTensor(Dense, (3, 3)) +@testset "DiagTensor contractions" for dev in devices_list(copy(ARGS)) + ## TODO add more GPU tests + elt = (dev == NDTensors.mtl ? Float32 : Float64) + t = tensor(Diag(elt[1.0, 1.0, 1.0]), (3, 3)) + A = randomTensor(Dense{elt}, (3, 3)) @test contract(t, (1, -2), t, (-2, 3)) == t @test contract(A, (1, -2), t, (-2, 3)) == A @test contract(A, (-2, 1), t, (-2, 3)) == transpose(A) + + ## Testing sparse contractions on GPU + t = tensor(Diag(one(elt)), (3, 3)) + @test contract(t, (-1, -2), dev(A), (-1, -2))[] ≈ dot(t, A) rtol = sqrt(eps(elt)) end nothing end diff --git a/NDTensors/test/test_diagblocksparse.jl b/NDTensors/test/test_diagblocksparse.jl index 2aa0e98c24..19fd7449ef 100644 --- a/NDTensors/test/test_diagblocksparse.jl +++ b/NDTensors/test/test_diagblocksparse.jl @@ -1,16 +1,20 @@ @eval module $(gensym()) using Dictionaries: Dictionary +using GPUArraysCore: @allowscalar using NDTensors: NDTensors, Block, BlockSparseTensor, + Diag, DiagBlockSparse, Tensor, blockoffsets, contract, + dense, + inds, nzblocks using Random: randn! -using Test: @test, @test_throws, @testset +using Test: @test, @test_broken, @test_throws, @testset @testset "UniformDiagBlockSparseTensor basic functionality" begin NeverAlias = NDTensors.NeverAlias AllowAlias = NDTensors.AllowAlias @@ -48,4 +52,37 @@ end @test_throws ErrorException contract(a1, labels1, a2, labels2) end end + +include("NDTensorsTestUtils/NDTensorsTestUtils.jl") +using .NDTensorsTestUtils: devices_list +@testset "DiagBlockSparse contract" for dev in devices_list(copy(ARGS)) + elt = dev == NDTensors.mtl ? Float32 : Float64 + A = dev(BlockSparseTensor{elt}([(1, 1), (2, 2)], [2, 2], [2, 2])) + randn!(A) + t = Tensor(DiagBlockSparse(one(elt), blockoffsets(A)), inds(A)) + tdense = Tensor(Diag(one(elt)), inds(A)) + + a = dense(contract(A, (1, -2), t, (3, -2))) + b = contract(dense(A), (1, -2), tdense, (3, -2)) + @test @allowscalar a ≈ b + + a = dense(contract(A, (-2, 1), t, (-2, 3))) + b = contract(dense(A), (-2, 1), tdense, (-2, 3)) + @test @allowscalar a ≈ b + + a = contract(A, (-1, -2), t, (-1, -2))[] + b = contract(dense(A), (-1, -2), tdense, (-1, -2))[] + @test @allowscalar a ≈ b + + ## TODO fix these kinds of contractions + A = BlockSparseTensor{elt}([(1, 1), (2, 2)], [3, 2, 3], [2, 2]) + randn!(A) + t = Tensor(DiagBlockSparse(one(elt), blockoffsets(A)), inds(A)) + @test_broken dense(contract(A, (1, -2), (t), (3, -2))) ≈ + contract(dense(A), (1, -2), dense(t), (3, -2)) + @test_broken dense(contract(A, (-2, 1), t, (-2, 3))) ≈ + contract(dense(A), (-2, 1), dense(t), (-2, 3)) + @test_broken contract(dev(A), (-1, -2), dev(t), (-1, -2))[] ≈ + contract(dense(A), (-1, -2), dense(t), (-1, -2))[] +end end