
Simplified spectral transform towards GPU version #575

Open
milankl opened this issue Sep 12, 2024 · 12 comments · May be fixed by #602
Labels: gpu 🖼️ (everything GPU related) · transform ⬅️ ➡️ (our spherical harmonic transform, grid to spectral, spectral to grid)

Comments


milankl commented Sep 12, 2024

using SpeedyWeather

function SpeedyTransforms.transform_simple!(  # SPECTRAL TO GRID
    grids::FullGaussianArray,                 # gridded output
    specs::LowerTriangularArray,              # spectral coefficients input
    S::SpectralTransform{NF};                 # precomputed transform
) where NF                                    # number format NF

    (; nlat, nlon_max, nlat_half) = S
    (; Λs, brfft_plans ) = S

    @boundscheck ismatching(S, grids) || throw(DimensionMismatch(S, grids))
    @boundscheck ismatching(S, specs) || throw(DimensionMismatch(S, specs))

    lmax = specs.m - 1            # 0-based maximum degree l of spherical harmonics
    mmax = specs.n - 1            # 0-based maximum order m of spherical harmonics

    # preallocate work arrays
    nfreq_max = nlon_max÷2 + 1
    gn = zeros(Complex{NF}, nfreq_max)      # phase factors for northern latitudes
    gs = zeros(Complex{NF}, nfreq_max)      # phase factors for southern latitudes

    rings = eachring(grids)                 # precomputed ring indices

    for k in eachgrid(grids)                # loop over all grids (e.g. vertical dimension)
        for j_north in 1:nlat_half              # symmetry: loop over northern latitudes only
            j_south = nlat - j_north + 1        # southern latitude index
            Λj = Λs[j_north]                    # Use precomputed Legendre polynomials Λ

            # INVERSE LEGENDRE TRANSFORM by looping over wavenumbers l, m
            lm = 1                              # flattened index for non-zero l, m indices
            for m in 1:mmax+1                   # loop over orders, Σ_{m=0}^{mmax}, but 1-based index
                acc_odd  = zero(Complex{NF})    # accumulator for isodd(l+m)
                acc_even = zero(Complex{NF})    # accumulator for iseven(l+m)

                for l in m:lmax+1               # loop over degrees, Σ_{l=m}^{lmax} but 1-based index
                    # anti-symmetry: sign change of odd harmonics on southern hemisphere
                    if iseven(l+m)
                        acc_even += specs[lm, k] * Λj[lm]
                    else
                        acc_odd += specs[lm, k] * Λj[lm]
                    end

                    lm += 1                     # count up flattened index for next harmonic
                end

                gn[m] += (acc_even + acc_odd)   # accumulators for northern
                gs[m] += (acc_even - acc_odd)   # and southern hemisphere (negative odd harmonics)
            end

            # INVERSE FOURIER TRANSFORM in zonal direction
            brfft_plan = brfft_plans[j_north]   # FFT planned wrt nlon on ring
            ilons = rings[j_north]              # in-ring indices northern ring
            grids[ilons, k] = brfft_plan*gn     # perform FFT on northern latitude

            # southern latitude
            ilons = rings[j_south]              # in-ring indices southern ring
            grids[ilons, k] = brfft_plan*gs     # perform FFT on southern latitude

            fill!(gn, 0)                        # set phase factors back to zero
            fill!(gs, 0)
        end
    end

    return grids
end

Results are identical (though not necessarily bitwise) to transform!, but it is only defined for FullGaussianArray. It skips some steps that transform! does for performance, some that are needed for generality with other grids, and it requires the Legendre polynomials to be precomputed in the SpectralTransform. Its performance is within 2x of transform! though. To be used like

grids = zeros(FullGaussianGrid, 24, 8)   # allocate output: 48-ring, 8-layer FullGaussianArray
specs = transform(grids)                 # allocate input: T31 (32x32) 8-layer LowerTriangularArray (all zeros)

Now let's transform the first 8 zonal harmonics l=0:7, m=0 in the 8 layers

for k in 1:8
    specs[k, k] = 1
end
julia> specs
528×8 LowerTriangularArray{ComplexF64, 2, Matrix{ComplexF64}}:
 1.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im
 0.0+0.0im  1.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im
 0.0+0.0im  0.0+0.0im  1.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im
 0.0+0.0im  0.0+0.0im  0.0+0.0im  1.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im
 0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  1.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im
 0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  1.0+0.0im  0.0+0.0im  0.0+0.0im
 0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  1.0+0.0im  0.0+0.0im
 0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  1.0+0.0im
 0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im  0.0+0.0im
 ⋮

Precompute a SpectralTransform (mostly the Legendre polynomials)

S = SpectralTransform(grids, recompute_legendre=false)

and then do

SpeedyTransforms.transform_simple!(grids, specs, S)

(with or without the SpeedyTransforms. depending on the scope you define that function in).
You can check the result by plotting the grids, e.g. the l=3, 4 zonal modes in layers k=4, 5:

[figure: maps of the transformed l=3 and l=4 zonal modes]
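
Alternatively, a quick numerical check is to compare against the reference transform (a minimal sketch, assuming transform! accepts the same (grids, specs, S) argument order):

grids_ref = zero(grids)                               # same FullGaussianGrid, all zeros
transform!(grids_ref, specs, S)                       # reference transform
SpeedyTransforms.transform_simple!(grids, specs, S)   # simplified transform
maximum(abs, grids - grids_ref)                       # should be at rounding-error level, not exactly 0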


milankl commented Sep 12, 2024

@jackleland In comparison, the leapfrog time integration is much simpler. leapfrog! takes 3 LowerTriangularArrays in, plus two scalars and a struct that only contains scalars; otherwise it is embarrassingly parallel over all elements lm (complex though!) of the arrays, i.e. over both the flattened degree-order (horizontal) and the layer (vertical) dimensions of the LowerTriangularArrays

@inbounds for lm in eachindex(A_old, A_new, tendency)
    a_old = A_old[lm]                       # double filtered value from previous time step (t-Δt)
    a_new = a_old + dt_NF*tendency[lm]      # Leapfrog/Euler step depending on dt=Δt, 2Δt (unfiltered at t+Δt)
    a_update = a_old - 2A_lf[lm] + a_new    # Eq. 8&9 in Williams (2009), calculate only once
    A_old[lm] = A_lf[lm] + w1*a_update      # Robert's filter: A_old[lm] becomes 2x filtered value at t
    A_new[lm] = a_new - w2*a_update         # Williams filter: A_new[lm] becomes 1x filtered value at t+Δt
end
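
Because it is element-wise, the same update can also be written as a handful of broadcasts, which map directly onto GPU arrays. A rough sketch with the names from the loop above (allocating temporaries for clarity, not the actual SpeedyWeather implementation):

a_new    = @. A_old + dt_NF*tendency    # unfiltered step to t+Δt
a_update = @. A_old - 2A_lf + a_new     # Eq. 8&9 in Williams (2009)
@. A_old = A_lf + w1*a_update           # Robert's filter: 2x filtered value at t
@. A_new = a_new - w2*a_update          # Williams filter: 1x filtered value at t+Δt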


milankl commented Sep 16, 2024

@jackleland Also notice how the transform interlaces the (inverse) Fourier transforms (FT) and Legendre transforms (LT), so that in the horizontal we essentially have

for j in each latitude ring
    LT
    FT
end

resulting in LT, FT, next ring, LT, FT, next ring, .... I deliberately wrote it that way (taken from Justin Willmert's implementation) to save memory: the "phase factors" gn, gs are the scratch memory that holds the Legendre-transformed but not yet Fourier-transformed data, and they are only vectors.

In contrast, one can also do (given it's a linear operation)

for j in each latitude ring
    LT
end

for j in each latitude ring
    FT
end

meaning first do all the Legendre transforms (but store the result somewhere, which is a matrix and therefore much larger than the gn, gs vectors) and then all the Fourier transforms. I guess in the end it's a memory-compute trade-off, but depending on the implementation and CPU vs GPU either approach can be better. Just to provide some context.
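
A rough sketch of that second variant (inside the loop over k, with the same names as in transform_simple! above and a scratch matrix G of size nfreq_max × nlat):

G = zeros(Complex{NF}, nfreq_max, nlat)     # Legendre- but not yet Fourier-transformed data

for j_north in 1:nlat_half                  # 1st pass: all inverse Legendre transforms
    j_south = nlat - j_north + 1
    Λj = Λs[j_north]
    lm = 1
    for m in 1:mmax+1
        acc_odd, acc_even = zero(Complex{NF}), zero(Complex{NF})
        for l in m:lmax+1
            iseven(l+m) ? (acc_even += specs[lm, k]*Λj[lm]) : (acc_odd += specs[lm, k]*Λj[lm])
            lm += 1
        end
        G[m, j_north] = acc_even + acc_odd
        G[m, j_south] = acc_even - acc_odd
    end
end

for j_north in 1:nlat_half                  # 2nd pass: all inverse Fourier transforms
    j_south = nlat - j_north + 1
    brfft_plan = brfft_plans[j_north]       # north and south ring have the same length
    grids[rings[j_north], k] = brfft_plan*G[:, j_north]
    grids[rings[j_south], k] = brfft_plan*G[:, j_south]
end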


milankl commented Sep 16, 2024

For reference, this is ECMWF's transform library https://github.com/ecmwf-ifs/ectrans and maybe @samhatfield can give us some insight into the choices they have made for the GPU implementation? Interlaced or not, Float32 everywhere, cuFFT, write the Legendre transform as a matrix-vector multiplication? Or does it all change massively because of your MPI parallelisation?

samhatfield (Contributor) commented

Our situation is naturally made massively more complicated by the fact that our fields are distributed across compute tasks. However we can wave this away by looking at what ecTrans does when you run on a single task.

We do all of the FFTs for each field (here "field" means a 2D slice of the atmosphere; different "fields" could be different meteorological variables, or different vertical levels of the same variable; there's no difference in the code). Then we do all the Legendre transforms. So there's no interleaving.

For the FFT, we loop over latitudes and plan one FFT for each latitude. This is a batched FFT with "fields" as the batching dimension. For this we use cufftPlanMany.

Once that's done we do the Legendre transform through a loop over zonal wavenumbers. For each wavenumber we execute a matrix-matrix multiplication. The "free" dimension of the multiplication is "fields" so again there's a kind of batching. For this we use cublasSgemm.

I'm not sure that ecTrans is the most useful reference for you, given the huge complication of the distributed-memory support. But you can take the general lesson of giving the GPU as much data to work on as possible in a single invocation (of cufftExec or whatever); they like batching :)
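
For illustration only, a minimal CUDA.jl sketch of that batched layout (made-up sizes and names, not ecTrans code): one planned inverse real FFT per latitude batched over the fields dimension, and one GEMM per zonal wavenumber with fields as the free dimension:

using CUDA, CUDA.CUFFT

nlon, nfields = 96, 8                              # points on this latitude ring, number of fields/layers
gn = CUDA.zeros(ComplexF32, nlon÷2 + 1, nfields)   # Fourier coefficients, one column per field
plan = plan_brfft(gn, nlon, 1)                     # inverse real FFT along dim 1, batched over dim 2 (cufftPlanMany)
ring = plan * gn                                   # all fields of this latitude in one call, size (nlon, nfields)

nlat_half, nharmonics = 24, 16                     # latitude pairs, harmonics for one zonal wavenumber m
Λm    = CUDA.rand(Float32, nlat_half, nharmonics)  # precomputed Legendre polynomials for this m
specm = CUDA.rand(Float32, nharmonics, nfields)    # spectral coefficients for this m (real part only, for Sgemm)
legm  = Λm * specm                                 # one matrix-matrix multiply (cublasSgemm) per wavenumber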


milankl commented Sep 17, 2024

Awesome, thanks Sam! Does either the FFT or the LT work in place? Or what do you do with the Fourier- but not yet Legendre-transformed field? Because that's another concern of mine: one would need to store this somewhere. It wouldn't be too bad to have a scratch matrix inside the SpectralTransform struct for that, but I was worried about the allocations if you don't!

samhatfield (Contributor) commented

The FFT is in place, but the LT isn't. But again, we have looooads of temporary arrays (we sort of have no choice). So again, if you rule out distributed memory parallelism, you can probably combine a lot of them to reduce memory consumption.


milankl commented Sep 23, 2024

Sam, what are the reasons for writing the LT as a matrix-vector multiplication instead of looping over the wavenumbers, making use of the odd/even symmetries on the hemispheres (what we do here)? I see that once you have precomputed the polynomials in lower triangular matrices, you could also take the next step and write those loops as one matrix-vector multiplication per latitude, but those matrices will be N² in size if the polynomials are N in size, if I'm not wrong? Or is that just a memory-speed trade-off that pays off for you if you just make it one big matmul operation?

samhatfield (Contributor) commented

We do loop over zonal wavenumbers, performing two GEMMs for each wavenumber: one symmetric and one antisymmetric.

I appreciate ecTrans is a gargantuan monster, so what might be useful is for me to write some Julia-y pseudocode that roughly mimics what we do in ecTrans...


milankl commented Sep 27, 2024

Ah okay so essentially the inner-most loop from above

                for l in m:lmax+1               # loop over degrees, Σ_{l=m}^{lmax} but 1-based index
                    # anti-symmetry: sign change of odd harmonics on southern hemisphere
                    if iseven(l+m)
                        acc_even += specs[lm, k] * Λj[lm]
                    else
                        acc_odd += specs[lm, k] * Λj[lm]
                    end

                    lm += 1                     # count up flattened index for next harmonic
                end

is what you represent as a matmul, okay yeah that makes more sense than also including the loop over zonal wavenumbers m... If you want to roughly replicate in Julia what you're doing, feel free to take the ~50 lines from above and tweak them!

samhatfield (Contributor) commented

I tried but it is so damn complicated trying to untangle all of the stuff which isn't relevant to your case. ecTrans can calculate grid point derivatives of fields "on the fly" which means there's all kinds of extra array spacing and indices you have to carry around, and several packing and unpacking steps. It also distinguishes between "wind-like" and scalar fields.

Instead I think we can step closer to an ecTrans-like inverse bit by bit. The below loop should be bit-identical to your reference version above, would you agree?

for k in eachgrid(grids)                # loop over all grids (e.g. vertical dimension)
    for j_north in 1:nlat_half              # symmetry: loop over northern latitudes only
        j_south = nlat - j_north + 1        # southern latitude index
        Λj = Λs[j_north]                    # Use precomputed Legendre polynomials Λ

        # INVERSE LEGENDRE TRANSFORM by looping over wavenumbers l, m
        lm = 1                              # flattened index for non-zero l, m indices
        for m in 1:mmax+1                   # loop over orders, Σ_{m=0}^{mmax}, but 1-based index
            acc_odd  = zero(Complex{NF})    # accumulator for isodd(l+m)
            acc_even = zero(Complex{NF})    # accumulator for iseven(l+m)

            # even transform (l+m even), l = m, m+2, ...
            for l in m:2:lmax+1
                acc_even += specs[lm+l-m, k] * Λj[lm+l-m]
            end

            # odd transform (l+m odd), l = m+1, m+3, ...
            for l in m+1:2:lmax+1
                acc_odd += specs[lm+l-m, k] * Λj[lm+l-m]
            end

            gn[m] += (acc_even + acc_odd)   # accumulators for northern
            gs[m] += (acc_even - acc_odd)   # and southern hemisphere (negative odd harmonics)

            lm += lmax - m + 2 # jump flattened index up to start of next zonal wavenumber
        end

        # INVERSE FOURIER TRANSFORM in zonal direction
        brfft_plan = brfft_plans[j_north]   # FFT planned wrt nlon on ring
        ilons = rings[j_north]              # in-ring indices northern ring
        grids[ilons, k] = brfft_plan*gn     # perform FFT on northern latitude

        # southern latitude
        ilons = rings[j_south]              # in-ring indices southern ring
        grids[ilons, k] = brfft_plan*gs     # perform FFT on southern latitude

        fill!(gn, 0)                        # set phase factors back to zero
        fill!(gs, 0)
    end
end

Now an opportunity for a dot product arises:

for k in eachgrid(grids)                # loop over all grids (e.g. vertical dimension)
    for j_north in 1:nlat_half              # symmetry: loop over northern latitudes only
        j_south = nlat - j_north + 1        # southern latitude index
        Λj = Λs[j_north]                    # Use precomputed Legendre polynomials Λ

        # INVERSE LEGENDRE TRANSFORM by looping over wavenumbers l, m
        lm = 1                              # flattened index for non-zero l, m indices
        for m in 1:mmax+1                   # loop over orders, Σ_{m=0}^{mmax}, but 1-based index
            # ⋅ is LinearAlgebra's dot product; real Λj goes first so the complex specs aren't conjugated
            acc_even = Λj[lm:2:lm+lmax+1-m] ⋅ specs[lm:2:lm+lmax+1-m, k]        # even transform
            acc_odd  = Λj[lm+1:2:lm+lmax+1-m] ⋅ specs[lm+1:2:lm+lmax+1-m, k]    # odd transform

            gn[m] += (acc_even + acc_odd)   # accumulators for northern
            gs[m] += (acc_even - acc_odd)   # and southern hemisphere (negative odd harmonics)

            lm += lmax - m + 2 # jump flattened index up to start of next zonal wavenumber
        end

        # INVERSE FOURIER TRANSFORM in zonal direction
        brfft_plan = brfft_plans[j_north]   # FFT planned wrt nlon on ring
        ilons = rings[j_north]              # in-ring indices northern ring
        grids[ilons, k] = brfft_plan*gn     # perform FFT on northern latitude

        # southern latitude
        ilons = rings[j_south]              # in-ring indices southern ring
        grids[ilons, k] = brfft_plan*gs     # perform FFT on southern latitude

        fill!(gn, 0)                        # set phase factors back to zero
        fill!(gs, 0)
    end
end

The above may not be strictly correct but you get the idea.

Now you can see how the outermost loop can be removed. If a k dimension is added to acc_even, acc_odd, and gn, gs, there is an opportunity for a mat-vec product.

This is still not what ecTrans does, but certainly closer, and has given you a bit more scope for using an external library routine for performing the mat-vec multiply.
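
Spelled out, that could look something like the following sketch (same hypothetical names as above; gn, gs then become matrices of size nfreq_max × n_layers):

even = lm:2:lm+lmax+1-m                            # flattened indices of the even harmonics of order m
odd  = lm+1:2:lm+lmax+1-m                          # flattened indices of the odd harmonics of order m

acc_even = transpose(specs[even, :]) * Λj[even]    # mat-vec: one value per layer/field k
acc_odd  = transpose(specs[odd,  :]) * Λj[odd]

gn[m, :] .+= acc_even .+ acc_odd                   # northern hemisphere, all layers at once
gs[m, :] .+= acc_even .- acc_odd                   # southern hemisphere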

Intuitively I would think traditional accelerator offloading wouldn't be worthwhile for you, given the problem sizes you deal with and the overheads of host<->accelerator transfers. On the other hand, the latest and future generations of GPU "superchips" will have unified memory between host and accelerator, removing the need for explicit transfers. You may want to develop your GPU version of SpeedyWeather with one of these devices in mind. Hopefully you can get access to a GH200 for development :p

samhatfield (Contributor) commented

Actually, a traditional separate memory host-accelerator programming model could work for you, provided you perform most or all of the kernels on the device. We are still far from having significant chunks of the IFS on GPUs, even if the GPU-enabled spectral transform is quite mature and fast.


maximilian-gelbrecht commented Oct 11, 2024

I spent some thought on the differentiability of the transform.

Long story short: don't worry about it at this point, as far as I can see.

In principle we'd like to define the reverse-mode rules for the transforms manually with ChainRules and EnzymeRules, similar to how that's already done for the Fourier transforms themselves. To do that, we'd ideally use a slightly modified version of the inverse plan for its pullback/adjoint. For that we need some flexibility with regard to the normalisation and quadrature weights in the code, but this is the kind of stuff we can also worry about afterwards; it doesn't need a fundamentally different design of the transform.
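
For context only, a very rough ChainRules sketch of what such a manual rule could look like (assuming a transform(specs, S) method for spectral-to-grid; the real pullback needs the adjusted quadrature weights and normalisation mentioned above):

using ChainRulesCore

function ChainRulesCore.rrule(::typeof(transform), specs::LowerTriangularArray, S::SpectralTransform)
    grids = transform(specs, S)                  # primal: spectral to grid
    function transform_pullback(Δgrids)
        # hypothetical: the adjoint of the inverse transform is, up to weights, the forward transform
        Δspecs = transform(unthunk(Δgrids), S)   # would use a re-weighted forward transform in practice
        return NoTangent(), Δspecs, NoTangent()
    end
    return grids, transform_pullback
end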

jackleland linked a pull request on Oct 31, 2024 that will close this issue
milankl linked a pull request on Dec 3, 2024 that will close this issue