
Commit: Type 3 working
DiamonDinoia committed Aug 16, 2024
1 parent 53a7c63 commit 9f517e3
Showing 5 changed files with 58 additions and 44 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -33,7 +33,7 @@ set(FINUFFT_CUDA_ARCHITECTURES "native" CACHE STRING "CUDA architectures to buil
# if FINUFFT_USE_CPU is OFF, the following options are ignored
set(FINUFFT_ARCH_FLAGS "native" CACHE STRING "Compiler flags for specifying target architecture, defaults to -march=native")
# sphinx tag (don't remove): @cmake_opts_end
cmake_dependent_option(FINUFFT_ENABLE_INSTALL "Disable installation in the case of python builds" OFF "FINUFFT_BUILD_PYTHON" OFF)
cmake_dependent_option(FINUFFT_ENABLE_INSTALL "Disable installation in the case of python builds" ON "NOT FINUFFT_BUILD_PYTHON" OFF)
cmake_dependent_option(FINUFFT_STATIC_LINKING "Disable static libraries in the case of python builds" ON "NOT FINUFFT_BUILD_PYTHON" OFF)
cmake_dependent_option(FINUFFT_SHARED_LINKING "Shared should be the opposite of static linking" ON "NOT FINUFFT_STATIC_LINKING" OFF)
# cmake-format: on
6 changes: 3 additions & 3 deletions include/cufinufft/impl.h
@@ -713,8 +713,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N,
d_plan->deconv, d_plan->deconv,
[c1, c2, c3, d1, d2, d3, imasign] __host__ __device__(
const thrust::tuple<T, T, T> tuple, cuda_complex<T> deconv)
-> cuda_complex<T> {
const thrust::tuple<T, T, T> tuple,
cuda_complex<T> deconv) -> cuda_complex<T> {
// d2 and d3 are 0 if dim < 2 and dim < 3
const auto phase = c1 * (thrust::get<0>(tuple) + d1) +
c2 * (thrust::get<1>(tuple) + d2) +
@@ -747,7 +747,7 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
int t2modes[] = {d_plan->nf1, d_plan->nf2, d_plan->nf3};
cufinufft_opts t2opts = d_plan->opts;
t2opts.modeord = 0;
t2opts.debug = std::max(0, t2opts.debug - 1);
t2opts.debug = std::max(0, t2opts.debug);
t2opts.gpu_spreadinterponly = 0;
// Safe to ignore the return value here?
if (d_plan->t2_plan) cufinufft_destroy_impl(d_plan->t2_plan);
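Note: in the first hunk above, setpts builds (via thrust::transform) the per-target-frequency factor later used for deconvolution: each deconv[k] is multiplied by a complex exponential whose phase is c1*(s_k + d1) + c2*(t_k + d2) + c3*(u_k + d3). The multiplication itself is cut off in the hunk, so the sketch below is a host-side illustration under the assumption that the factor is exp(i*imasign*phase); the names follow the lambda captures, and this is not the library's code.

```cpp
#include <cmath>
#include <complex>
#include <cstddef>
#include <vector>

// Host-side sketch (assumption, not cufinufft code) of the per-frequency update:
//   deconv[k] *= exp(i * imasign * (c1*(s[k]+d1) + c2*(t[k]+d2) + c3*(u[k]+d3)))
// d2/d3 are 0 when dim < 2 / dim < 3, matching the comment in the diff.
template <typename T>
void apply_type3_deconv_phase(std::vector<std::complex<T>> &deconv,
                              const std::vector<T> &s, const std::vector<T> &t,
                              const std::vector<T> &u,
                              T c1, T c2, T c3, T d1, T d2, T d3, T imasign) {
  for (std::size_t k = 0; k < deconv.size(); ++k) {
    const T phase = c1 * (s[k] + d1) + c2 * (t[k] + d2) + c3 * (u[k] + d3);
    deconv[k] *= std::complex<T>(std::cos(phase), imasign * std::sin(phase));
  }
}

int main() {
  std::vector<std::complex<double>> deconv(4, {1.0, 0.0});
  std::vector<double> s{0.1, 0.2, 0.3, 0.4}, t(4, 0.0), u(4, 0.0); // 1D case: d2 = d3 = 0
  apply_type3_deconv_phase(deconv, s, t, u, 2.0, 0.0, 0.0, 0.5, 0.0, 0.0, 1.0);
}
```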
17 changes: 10 additions & 7 deletions src/cuda/3d/cufinufft3d.cu
@@ -135,36 +135,39 @@ int cufinufft3d3_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
int ier;
cuda_complex<T> *d_cstart;
cuda_complex<T> *d_fkstart;
cuda_complex<T> *d_cbatch_start;
const auto stream = d_plan->stream;
for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) {
int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M;
d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->N;
d_cbatch_start = d_plan->c_batch + i * d_plan->maxbatchsize * d_plan->M;
d_plan->c = d_cbatch_start;
d_plan->fk = d_plan->fw;
// setting input for spreader
d_plan->c = d_plan->c_batch + i * d_plan->maxbatchsize * d_plan->M;
// setting output for spreader
d_plan->fk = d_plan->fw;
// NOTE: fw might need to be set to 0
// Step 0: pre-phase the input strengths
for (int i = 0; i < blksize; i++) {
thrust::transform(thrust::cuda::par.on(stream), d_plan->prephase,
d_plan->prephase + d_plan->M, d_cstart + i * d_plan->M,
d_plan->c_batch + i * d_plan->M,
thrust::multiplies<cuda_complex<T>>());
d_plan->c + i * d_plan->M, thrust::multiplies<cuda_complex<T>>());
}
// Step 1: Spread
if ((ier = cuspread3d<T>(d_plan, blksize))) return ier;
// now d_plan->fk = d_plan->fw contains the spread values
// Step 2: Type 3 NUFFT
// type 2 goes from fk to c
// saving the results directly in the user output array d_fk
// it needs to do blksize transforms
d_plan->t2_plan->ntransf = blksize;
if ((ier = cufinufft3d2_exec<T>(d_fkstart, d_plan->fw, d_plan->t2_plan))) return ier;
// Step 3: deconvolve
// now we need d_fk = d_fk * d_plan->deconv
for (int i = 0; i < blksize; i++) {
thrust::transform(thrust::cuda::par.on(stream), d_plan->deconv,
d_plan->deconv + d_plan->N, d_fkstart + i * d_plan->N,
d_fkstart + i * d_plan->N, thrust::multiplies<cuda_complex<T>>());
}
}

return 0;
}

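Note: the exec loop above follows the standard type-3 decomposition: pre-phase the strengths into c_batch, spread them onto the fine grid fw, run the inner type-2 plan directly into the user output d_fkstart, then multiply by the deconvolution factors. For reference, a naive direct evaluation of the transform being approximated is sketched below; this is just the mathematical definition (assuming FINUFFT's sign convention for iflag), not the library's algorithm, and is the kind of check the tests use for small problems.

```cpp
#include <cmath>
#include <complex>
#include <cstddef>
#include <vector>

// Naive O(M*N) reference (a sketch, not cufinufft's algorithm) for the 3D type-3 NUFFT:
//   f[k] = sum_j c[j] * exp(i * sign * (s[k]*x[j] + t[k]*y[j] + u[k]*z[j]))
// with sign = +1 for iflag >= 0, else -1 (assumed FINUFFT sign convention).
template <typename T>
std::vector<std::complex<T>> type3_direct(
    const std::vector<T> &x, const std::vector<T> &y, const std::vector<T> &z,
    const std::vector<std::complex<T>> &c, const std::vector<T> &s,
    const std::vector<T> &t, const std::vector<T> &u, int iflag) {
  const T sign = iflag >= 0 ? T(1) : T(-1);
  std::vector<std::complex<T>> f(s.size(), std::complex<T>(0, 0));
  for (std::size_t k = 0; k < s.size(); ++k)
    for (std::size_t j = 0; j < x.size(); ++j) {
      const T phase = sign * (s[k] * x[j] + t[k] * y[j] + u[k] * z[j]);
      f[k] += c[j] * std::complex<T>(std::cos(phase), std::sin(phase));
    }
  return f;
}

int main() {
  std::vector<double> x{0.1}, y{0.2}, z{0.3}, s{1.0}, t{2.0}, u{3.0};
  std::vector<std::complex<double>> c{{1.0, 0.0}};
  auto f = type3_direct(x, y, z, c, s, t, u, /*iflag=*/1); // single source, single target
  (void)f;
}
```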
16 changes: 8 additions & 8 deletions test/cuda/cufinufft_setpts.cu
@@ -19,7 +19,7 @@
#include <thrust/host_vector.h>

// for now, once finufft is demacroized we can test float
using T = double;
using test_t = double;

template<typename T, typename V> bool equal(V *d_vec, T *cpu, const std::size_t size) {
// copy d_vec to cpu
@@ -98,15 +98,15 @@ int main() {
const int N = n_modes[0] * n_modes[1] * n_modes[2];
const int M = 100;

thrust::host_vector<T> x(M * ntransf), y(M * ntransf), z(M * ntransf), s(N * ntransf),
t(N * ntransf), u(N * ntransf);
thrust::host_vector<thrust::complex<T>> c(M * ntransf), fk(N * ntransf);
thrust::host_vector<test_t> x(M * ntransf), y(M * ntransf), z(M * ntransf),
s(N * ntransf), t(N * ntransf), u(N * ntransf);
thrust::host_vector<thrust::complex<test_t>> c(M * ntransf), fk(N * ntransf);

thrust::device_vector<T> d_x{}, d_y{}, d_z{}, d_s{}, d_t{}, d_u{};
thrust::device_vector<thrust::complex<T>> d_c(M * ntransf), d_fk(N * ntransf);
thrust::device_vector<test_t> d_x{}, d_y{}, d_z{}, d_s{}, d_t{}, d_u{};
thrust::device_vector<thrust::complex<test_t>> d_c(M * ntransf), d_fk(N * ntransf);

std::default_random_engine eng(42);
std::uniform_real_distribution<T> dist11(-1, 1);
std::uniform_real_distribution<test_t> dist11(-1, 1);
auto rand_util_11 = [&eng, &dist11]() {
return dist11(eng);
};
@@ -237,7 +237,7 @@ int main() {
};
// testing correctness of the plan creation
// cufinufft_plan_t<float> *single_plan{nullptr};
cufinufft_plan_t<T> *double_plan{nullptr};
cufinufft_plan_t<test_t> *double_plan{nullptr};
// test_type1(double_plan);
// test_type2(double_plan);
test_type3(double_plan);
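Note: the main change in this test is renaming the file-scope alias from T to test_t, presumably to avoid colliding with the function-local `using T = typename std::remove_pointer<decltype(plan)>::type::real_t;` that the test lambdas declare (visible in cufinufft_type3_test.cu below). A minimal, purely illustrative example of the shadowing the rename sidesteps; FakePlan is a hypothetical stand-in, not a repository type.

```cpp
#include <type_traits>

using T = double; // file-scope alias, as in the old test code

struct FakePlan { using real_t = float; }; // hypothetical stand-in for a plan type

template <typename Plan>
void test_body(Plan *plan) {
  // The local alias shadows the file-scope T inside this body, so the two
  // spellings can silently mean different types in the same file.
  using T = typename std::remove_pointer<decltype(plan)>::type::real_t;
  static_assert(std::is_floating_point<T>::value, "plan real type expected");
  (void)plan;
}

int main() {
  FakePlan p;
  test_body(&p); // inner T is float here while the outer T is double
}
```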
61 changes: 36 additions & 25 deletions test/cuda/cufinufft_type3_test.cu
@@ -18,9 +18,6 @@
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

// for now, once finufft is demacroized we can test float
using T = double;

template<typename T, typename V> bool equal(V *d_vec, T *cpu, const std::size_t size) {
// copy d_vec to cpu
thrust::host_vector<T> h_vec(size);
@@ -75,10 +72,10 @@ auto almost_equal(V *d_vec,
assert(cudaMemcpy(h_vec.data(), d_vec, size * sizeof(T), cudaMemcpyDeviceToHost) ==
cudaSuccess);
// print h_vec and cpu
// for (std::size_t i = 0; i < size; ++i) {
// std::cout << "gpu[" << i << "]: " << h_vec[i] << " cpu[" << i << "]: " << cpu[i]
// << '\n';
// }
for (std::size_t i = 0; i < size; ++i) {
std::cout << "gpu[" << i << "]: " << h_vec[i] << " cpu[" << i << "]: " << cpu[i]
<< '\n';
}
std::cout << "relerrtwonorm: " << infnorm(h_vec.data(), cpu, size) << std::endl;
// compare the l2 norm of the difference between the two vectors
if (relerrtwonorm(h_vec.data(), cpu, size) < tol) {
@@ -88,32 +85,39 @@
}

int main() {
// for now, once finufft is demacroized we can test float
using test_t = double;

// defaults. tests should shadow them to override
cufinufft_opts opts;
cufinufft_default_opts(&opts);
opts.debug = 2;
opts.debug = 2;
opts.upsampfac = 1.25;
opts.gpu_kerevalmeth = 1;
// opts.gpu_sort = 0;
finufft_opts fin_opts;
finufft_default_opts(&fin_opts);
fin_opts.debug = 2;
fin_opts.spread_kerevalmeth = 1;
fin_opts.upsampfac = 1.25;
const int iflag = 1;
const int ntransf = 1;
const int dim = 3;
const double tol = 1e-9;
const int N = 1023;
const int n_modes[] = {10, 5, 3};
const int N = n_modes[0] * n_modes[1] * n_modes[2];
const int M = 1000;
const double bandwidth = 50.0;

thrust::host_vector<T> x(M * ntransf), y(M * ntransf), z(M * ntransf), s(N * ntransf),
t(N * ntransf), u(N * ntransf);
thrust::host_vector<std::complex<T>> c(M * ntransf), fk(N * ntransf);
thrust::host_vector<test_t> x(M * ntransf), y(M * ntransf), z(M * ntransf),
s(N * ntransf), t(N * ntransf), u(N * ntransf);
thrust::host_vector<std::complex<test_t>> c(M * ntransf), fk(N * ntransf);

thrust::device_vector<T> d_x{}, d_y{}, d_z{}, d_s{}, d_t{}, d_u{};
thrust::device_vector<std::complex<T>> d_c(M * ntransf), d_fk(N * ntransf);
thrust::device_vector<test_t> d_x{}, d_y{}, d_z{}, d_s{}, d_t{}, d_u{};
thrust::device_vector<std::complex<test_t>> d_c(M * ntransf), d_fk(N * ntransf);

std::default_random_engine eng(42);
std::uniform_real_distribution<T> dist11(-1, 1);
std::uniform_real_distribution<test_t> dist11(-1, 1);
auto rand_util_11 = [&eng, &dist11]() {
return dist11(eng);
};
@@ -161,11 +165,12 @@ int main() {
cudaDeviceSynchronize();

const auto cpu_planer =
[iflag, tol, ntransf, dim, M, N, &x, &y, &z, &s, &t, &u, &fin_opts](
[iflag, tol, ntransf, dim, M, N, n_modes, &x, &y, &z, &s, &t, &u, &fin_opts](
const auto type) {
finufft_plan_s *plan{nullptr};
assert(finufft_makeplan(
type, dim, nullptr, iflag, ntransf, tol, &plan, &fin_opts) == 0);
std::int64_t nl[] = {n_modes[0], n_modes[1], n_modes[2]};
assert(
finufft_makeplan(type, dim, nl, iflag, ntransf, tol, &plan, &fin_opts) == 0);
assert(finufft_setpts(plan, M, x.data(), y.data(), z.data(), N, s.data(),
t.data(), u.data()) == 0);
return plan;
@@ -204,6 +209,7 @@ int main() {
deconv_tol,
M,
N,
n_modes,
&d_x,
&d_y,
&d_z,
@@ -219,8 +225,8 @@
using T = typename std::remove_pointer<decltype(plan)>::type::real_t;
const int type = 3;
const auto cpu_plan = cpu_planer(type);
assert(cufinufft_makeplan_impl<T>(type, dim, nullptr, iflag, ntransf, T(tol), &plan,
&opts) == 0);
assert(cufinufft_makeplan_impl<T>(type, dim, (int *)n_modes, iflag, ntransf, T(tol),
&plan, &opts) == 0);
assert(cufinufft_setpts_impl<T>(M, d_x.data().get(), d_y.data().get(),
d_z.data().get(), N, d_s.data().get(),
d_t.data().get(), d_u.data().get(), plan) == 0);
@@ -245,6 +251,11 @@
assert(equal(plan->kz, cpu_plan->Z, M));
assert(equal(plan->d_s, cpu_plan->Sp, N));
assert(equal(plan->d_t, cpu_plan->Tp, N));
assert(plan->spopts.nspread == cpu_plan->spopts.nspread);
assert(plan->spopts.upsampfac == cpu_plan->spopts.upsampfac);
assert(plan->spopts.ES_beta == cpu_plan->spopts.ES_beta);
assert(plan->spopts.ES_halfwidth == cpu_plan->spopts.ES_halfwidth);
assert(plan->spopts.ES_c == cpu_plan->spopts.ES_c);
assert(equal(plan->d_u, cpu_plan->Up, N));
// NOTE: seems with infnorm we are getting at most 11 digits of precision
std::cout << "prephase :\n";
@@ -258,10 +269,10 @@
c[i].imag(randm11());
}
d_c = c;
for (int i = 0; i < N; i++) {
fk[i] = {-100, -100};
}
d_fk = fk;
// for (int i = 0; i < N; i++) {
// fk[i] = {randm11(), randm11()};
// }
// d_fk = fk;
cufinufft_execute_impl(
(cuda_complex<T> *)d_c.data().get(), (cuda_complex<T> *)d_fk.data().get(), plan);
finufft_execute(cpu_plan, (std::complex<T> *)c.data(), (std::complex<T> *)fk.data());
@@ -273,7 +284,7 @@
};
// testing correctness of the plan creation
// cufinufft_plan_t<float> *single_plan{nullptr};
cufinufft_plan_t<T> *double_plan{nullptr};
cufinufft_plan_t<test_t> *double_plan{nullptr};
// test_type1(double_plan);
// test_type2(double_plan);
test_type3(double_plan);
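Note: the asserts in this test compare GPU plan internals and outputs against the CPU FINUFFT plan via the relerrtwonorm and infnorm helpers defined earlier in the file, whose bodies are not shown in this diff. As a hedged sketch, helpers with these names typically mean the relative l2 error and the maximum entrywise difference; the signatures below follow the call sites, but the bodies are assumptions, not the repository's definitions.

```cpp
#include <algorithm>
#include <cmath>
#include <complex>
#include <cstddef>

// Assumed definitions matching the call sites relerrtwonorm(a, b, n) and infnorm(a, b, n):
// relative l2 error of a against b, and the max absolute entrywise difference.
template <typename T>
double relerrtwonorm(const std::complex<T> *a, const std::complex<T> *b, std::size_t n) {
  double num = 0, den = 0;
  for (std::size_t i = 0; i < n; ++i) {
    num += std::norm(a[i] - b[i]); // |a_i - b_i|^2
    den += std::norm(b[i]);        // |b_i|^2
  }
  return std::sqrt(num / den);
}

template <typename T>
double infnorm(const std::complex<T> *a, const std::complex<T> *b, std::size_t n) {
  double m = 0;
  for (std::size_t i = 0; i < n; ++i)
    m = std::max(m, static_cast<double>(std::abs(a[i] - b[i])));
  return m;
}

int main() {
  const std::complex<double> a[2] = {{1.0, 0.0}, {0.0, 1.0}};
  const std::complex<double> b[2] = {{1.0, 1e-12}, {0.0, 1.0}};
  return (relerrtwonorm(a, b, 2) < 1e-9 && infnorm(a, b, 2) < 1e-9) ? 0 : 1;
}
```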
