diff --git a/.github/workflows/core_tests.yml b/.github/workflows/core_tests.yml new file mode 100644 index 000000000..3e8e62176 --- /dev/null +++ b/.github/workflows/core_tests.yml @@ -0,0 +1,26 @@ +name: Core Tests. + +on: + push: + branches: [master] + pull_request: + branches: [master] + +permissions: + contents: read + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.8.12 + uses: actions/setup-python@v4 + with: + python-version: "3.8.12" + - name: Install dependencies + run: | + pip install isort==5.10.1 black[jupyter]==22.3.0 + - name: Run Black Format Check + run: black . diff_rast/ tests/ examples/ --check diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml new file mode 100644 index 000000000..af384ce8f --- /dev/null +++ b/.github/workflows/doc.yml @@ -0,0 +1,35 @@ +name: Docs +on: + push: + branches: [main] + pull_request: + branches: [main] + workflow_dispatch: + +permissions: + contents: write +jobs: + docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + with: + python-version: '3.9' + - name: Install dependencies + run: | + pip install -r docs/requirements.txt + pip install torch==2.0.0 --index-url https://download.pytorch.org/whl/cpu + BUILD_NO_CUDA=1 pip install . + - name: Sphinx build + # fail on warnings: "-W --keep-going" + run: | + sphinx-build docs/source _build + - name: Deploy + uses: peaceiris/actions-gh-pages@v3 + with: + publish_branch: gh-pages + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: _build/ + force_orphan: true + # cname: ... \ No newline at end of file diff --git a/.gitignore b/.gitignore index ddc3d2fde..548d2ddf4 100644 --- a/.gitignore +++ b/.gitignore @@ -117,12 +117,6 @@ venv.bak/ # line_profiler *.lprof -# vscode -.vsocde - *build compile_commands.json - -*.png -*.txt -*.dump +*.dump \ No newline at end of file diff --git a/.gitmodules b/.gitmodules index d3f25e6ad..c62722305 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "csrc/third_party/glm"] - path = csrc/third_party/glm - url = https://github.com/g-truc/glm.git +[submodule "diff_rast/cuda/csrc/third_party/glm"] + path = diff_rast/cuda/csrc/third_party/glm + url = https://github.com/g-truc/glm.git \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 000000000..97db421e3 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +recursive-include dir-pattern diff_rast/cuda/csrc \ No newline at end of file diff --git a/README.md b/README.md index 459e27548..b9f9efcd2 100644 --- a/README.md +++ b/README.md @@ -2,18 +2,25 @@ Our version of differentiable gaussian rasterizer -# ref_rast +# Installation -Copied official version of differentiable gaussian rasterizer +For CUDA development, it is recommend to install with `BUILD_NO_CUDA=1`, which +will disable compiling during pip install, and instead use JIT compiling on your +first run. The benefit of JIT compiling is that it does incremental compiling as +you modify your cuda code so it is much faster than re-compile through pip. Note +the JIT compiled library can be found under `~/.cache/torch_extensions/py*-cu*/`. -# Installation +``` +BUILD_NO_CUDA=1 pip install -e .[dev] +``` + +If you won't touch the underlying CUDA code, you can just install with compiling: ``` -python3 -m pip install --upgrade pip -cd diff_rast; pip install -e . -cd ../ref_rast; pip install -e . +pip install -e .[dev] ``` + # Brief walkthrough The main python bindings for rasterization are found by importing diff_rast @@ -23,13 +30,6 @@ import diff_rast help(diff_rast) ``` -Additional supported cuda functions are found by importing cuda_lib - -``` -import cuda_lib -help(cuda_lib) -``` - # clangd setup (for Neovim) [clangd](https://clangd.llvm.org/) is a nice tool for providing completions, diff --git a/csrc/check_serial_backward.cu b/csrc/check_serial_backward.cu deleted file mode 100644 index e391ccf25..000000000 --- a/csrc/check_serial_backward.cu +++ /dev/null @@ -1,285 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include "backward.cuh" -#include "forward.cuh" -#include "helpers.cuh" -#include "serial_backward.cuh" - -#include "debug_utils.h" - -float random_float() { return (float)std::rand() / RAND_MAX; } - -float4 random_quat() { - float u = random_float(); - float v = random_float(); - float w = random_float(); - return { - sqrt(1.f - u) * sin(2.f * (float)M_PI * v), - sqrt(1.f - u) * cos(2.f * (float)M_PI * v), - sqrt(u) * sin(2.f * (float)M_PI * w), - sqrt(u) * cos(2.f * (float)M_PI * w)}; -} - -float3 compute_conic(const float3 cov2d) { - float det = cov2d.x * cov2d.z - cov2d.y * cov2d.y; - if (det == 0.f) - return {0.f, 0.f, 0.f}; - float inv_det = 1.f / det; - float3 conic; - conic.x = cov2d.z * inv_det; - conic.y = -cov2d.y * inv_det; - conic.z = cov2d.x * inv_det; - return conic; -} - -void compare_project2d_mean_backward(int check_index) { - float3 mean = {random_float(), random_float(), random_float()}; - // clang-format off - float proj[] = { - 1.f, 0.f, 0.f, 0.f, - 0.f, 1.f, 0.f, 0.f, - 0.f, 0.f, 1.f, 0.f, - 0.f, 0.f, 0.f, 1.f - }; - // clang-format on - float2 dL_dmean2d = {random_float(), random_float()}; - - float3 dL_dmean = project_pix_vjp(proj, mean, (dim3){2, 2, 1}, dL_dmean2d); - float3 dL_dmean_ref = projectMean2DBackward(mean, proj, dL_dmean2d); - - // comparison - const std::vector> dmean_data{ - {dL_dmean.x, dL_dmean_ref.x}, - {dL_dmean.y, dL_dmean_ref.y}, - {dL_dmean.z, dL_dmean_ref.z}, - }; - print_errors(dmean_data, "dmean (project2d)", check_index); -} - -void compare_conic_backward(int check_index) { - float3 cov2d = {random_float(), random_float(), random_float()}; - float3 conic = compute_conic(cov2d); - float3 dL_dconic = {random_float(), random_float(), random_float()}; - float3 dL_dcov2d; - cov2d_to_conic_vjp(conic, dL_dconic, dL_dcov2d); - float3 dL_dcov2d_ref = computeConicBackward(cov2d, dL_dconic); - - // comparison - const std::vector> dcov2d_data{ - {dL_dcov2d.x, dL_dcov2d_ref.x}, - {dL_dcov2d.y, dL_dcov2d_ref.y}, - {dL_dcov2d.z, dL_dcov2d_ref.z}, - }; - print_errors(dcov2d_data, "dcov2d (conic)", check_index); -} - -void compare_cov3d_backward(int check_index) { - float3 scale = {random_float(), random_float(), random_float()}; - float4 quat = random_quat(); - float4 quat_ref = quat; - // float4 quat_ref = {quat.w, quat.x, quat.y, quat.z}; - float dL_dcov3d[] = { - random_float(), - random_float(), - random_float(), - random_float(), - random_float(), - random_float()}; - float3 dL_ds = {0.f, 0.f, 0.f}; - float4 dL_dq = {0.f, 0.f, 0.f, 0.f}; - scale_rot_to_cov3d_vjp(scale, 1.f, quat, dL_dcov3d, dL_ds, dL_dq); - - float3 dL_ds_ref = {0.f, 0.f, 0.f}; - float4 dL_dq_ref = {0.f, 0.f, 0.f, 0.f}; - computeCov3DBackward(scale, 1.f, quat_ref, dL_dcov3d, dL_ds_ref, dL_dq_ref); - - // comparison - const std::vector> ds_data{ - {dL_ds.x, dL_ds_ref.x}, - {dL_ds.y, dL_ds_ref.y}, - {dL_ds.z, dL_ds_ref.z}, - }; - print_errors(ds_data, "ds (cov3d)", check_index); - - const std::vector> dquat_data{ - {dL_dq.x, dL_dq_ref.x}, - {dL_dq.y, dL_dq_ref.y}, - {dL_dq.z, dL_dq_ref.z}, - {dL_dq.w, dL_dq_ref.w}, - }; - print_errors(dquat_data, "dquat (cov3d)", check_index); -} - -void compare_cov2d_ewa_backward(int check_index) { - float3 mean = {random_float(), random_float(), random_float()}; - float3 scale = {random_float(), random_float(), random_float()}; - float4 quat = random_quat(); - float cov3d[] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - scale_rot_to_cov3d(scale, 1.f, quat, cov3d); - float3 dL_dcov2d = {random_float(), random_float(), random_float()}; - float3 dL_dmean = {0.f, 0.f, 0.f}; - float3 dL_dmean_ref = {0.f, 0.f, 0.f}; - float dL_dcov[] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - float dL_dcov_ref[] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - - // functions expect different view matrix convention - // clang-format off - float viewmat[] = { - 1.f, 0.f, 0.f, 0.f, - 0.f, 1.f, 0.f, 0.f, - 0.f, 0.f, 1.f, 10.f, - 0.f, 0.f, 0.f, 1.f - }; - float viewmat_ref[] = { - 1.f, 0.f, 0.f, 0.f, - 0.f, 1.f, 0.f, 0.f, - 0.f, 0.f, 1.f, 0.f, - 0.f, 0.f, 10.f, 1.f - }; - // clang-format on - computeCov2DBackward( - mean, - cov3d, - viewmat_ref, - 1.f, - 1.f, - 1.f, - 1.f, - dL_dcov2d, - dL_dmean_ref, - dL_dcov_ref - ); - project_cov3d_ewa_vjp( - mean, cov3d, viewmat, 1.f, 1.f, dL_dcov2d, dL_dmean, dL_dcov - ); - - // comparison - const std::vector> dmean_data{ - {dL_dmean.x, dL_dmean_ref.x}, - {dL_dmean.x, dL_dmean_ref.x}, - {dL_dmean.x, dL_dmean_ref.x}, - }; - print_errors(dmean_data, "dmean (cov2d_ewa)", check_index); - - std::vector> dcov_data; - for (int i = 0; i < 6; ++i) - dcov_data.push_back({dL_dcov[i], dL_dcov_ref[i]}); - - print_errors(dcov_data, "dcov (cov2d_ewa)", check_index); -} - -void compare_rasterize_backward(const int N, int check_index) { - float2 p = {0.f, 0.f}; - float T_final = 5e-3; - constexpr int C = 3; - float dL_dout[C]; - for (int i = 0; i < C; ++i) { - dL_dout[i] = random_float(); - } - - float2 xys[N]; - float3 conics[N]; - float opacities[N]; - float4 conics_o[N]; - float rgbs[C * N]; - for (int i = 0; i < N; ++i) { - float v = (float)i - (float)N * 0.5f; - xys[i] = {0.1f * v, 0.1f * v}; - conics[i] = {10.f, 0.f, 10.f}; - opacities[i] = 0.5f; - conics_o[i] = {conics[i].x, conics[i].y, conics[i].z, opacities[i]}; - for (int c = 0; c < C; ++c) { - rgbs[C * i + c] = random_float(); - } - } - float dL_drgb[C * N] = {0.f}; - float dL_drgb_ref[C * N] = {0.f}; - float dL_do[N] = {0.f}; - float dL_do_ref[N] = {0.f}; - float2 dL_dm[N] = {0.f}; - float2 dL_dm_ref[N] = {0.f}; - float3 dL_dc[N] = {0.f}; - float3 dL_dc_ref[N] = {0.f}; - - rasterize_vjp( - N, - p, - C, - xys, - conics, - opacities, - rgbs, - T_final, - dL_dout, - dL_drgb, - dL_do, - dL_dm, - dL_dc - ); - rasterizeBackward( - N, - 2, - 2, - p, - xys, - conics_o, - rgbs, - T_final, - dL_dout, - dL_drgb_ref, - dL_do_ref, - dL_dm_ref, - dL_dc_ref - ); - - // comparison - std::vector> drgb_data; - for (int i = 0; i < C * N; ++i) - drgb_data.push_back({dL_drgb[i], dL_drgb_ref[i]}); - print_errors(drgb_data, "drgb (rasterize)", check_index); - - std::vector> do_data; - for (int i = 0; i < N; ++i) - do_data.push_back({dL_do[i], dL_do_ref[i]}); - print_errors(do_data, "do (rasterize)", check_index); - - std::vector> dm_data; - for (int i = 0; i < N; ++i) { - dm_data.push_back({dL_dm[i].x, dL_dm_ref[i].x}); - dm_data.push_back({dL_dm[i].y, dL_dm_ref[i].y}); - } - print_errors(dm_data, "dm (rasterize)", check_index); - - std::vector> dc_data; - for (int i = 0; i < N; ++i) { - dc_data.push_back({dL_dc[i].x, dL_dc_ref[i].x}); - dc_data.push_back({dL_dc[i].y, dL_dc_ref[i].y}); - dc_data.push_back({dL_dc[i].z, dL_dc_ref[i].z}); - } - print_errors(dc_data, "dc (rasterize)", check_index); -} - -int main(int argc, char *argv[]) { - int num_iters = 1000; - if (argc > 1) { - num_iters = std::stoi(argv[1]); - } - int num_points = 8; - if (argc > 2) { - num_points = std::stoi(argv[2]); - } - for (int x = 0; x < num_iters; x++) { - compare_project2d_mean_backward(x); - compare_conic_backward(x); - compare_cov3d_backward(x); - compare_cov2d_ewa_backward(x); - compare_rasterize_backward(num_points, x); - } - return 0; -} diff --git a/csrc/check_serial_forward.cu b/csrc/check_serial_forward.cu deleted file mode 100644 index 54aa8d651..000000000 --- a/csrc/check_serial_forward.cu +++ /dev/null @@ -1,482 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include "forward.cuh" -#include "helpers.cuh" - -#include "reference/cuda_rasterizer/forward.cu" - -float random_float() { return (float)std::rand() / RAND_MAX; } - -float4 random_quat() { - float u = random_float(); - float v = random_float(); - float w = random_float(); - return { - sqrt(1.f - u) * sin(2.f * (float)M_PI * v), - sqrt(1.f - u) * cos(2.f * (float)M_PI * v), - sqrt(u) * sin(2.f * (float)M_PI * w), - sqrt(u) * cos(2.f * (float)M_PI * w)}; -} - -// Host version of ref preprocessCUDA -__host__ void host_preprocessCUDA( - int P, - const float *orig_points, - const glm::vec3 *scales, - const float scale_modifier, - const glm::vec4 *rotations, - const float *opacities, - const float *shs, - bool *clamped, - const float *cov3D_precomp, - const float *colors_precomp, - const float *viewmatrix, - const float *projmatrix, - const glm::vec3 *cam_pos, - const int W, - int H, - const float tan_fovx, - float tan_fovy, - const float focal_x, - float focal_y, - int *radii, - float2 *points_xy_image, - float *depths, - float *cov3Ds, - float *rgb, - float4 *conic_opacity, - const dim3 grid, // tile_bounds - uint32_t *tiles_touched -) { - auto idx = 0; - if (idx >= P) - return; - int C = 3; - // Initialize radius and touched tiles to 0. If this isn't changed, - // this Gaussian will not be processed further. - radii[idx] = 0; - tiles_touched[idx] = 0; - - // Perform near culling, quit if outside. - float3 p_view; - if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, false, p_view)) - return; - - // printf("ref: p_view %d %.2f %.2f %.2f\n", idx, p_view.x, p_view.y, - // p_view.z); - - // Transform point by projecting - float3 p_orig = { - orig_points[3 * idx], - orig_points[3 * idx + 1], - orig_points[3 * idx + 2]}; - float4 p_hom = transformPoint4x4(p_orig, projmatrix); - float p_w = 1.0f / (p_hom.w + 0.0000001f); - float3 p_proj = {p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w}; - // printf("reff rast: p_proj %.2f %.2f \n", p_proj.x, p_proj.y); - - // If 3D covariance matrix is precomputed, use it, otherwise compute - // from scaling and rotation parameters. - const float *cov3D; - if (cov3D_precomp != nullptr) { - cov3D = cov3D_precomp + idx * 6; - } else { - computeCov3D( - scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6 - ); - cov3D = cov3Ds + idx * 6; - } - // Compute 2D screen-space covariance matrix - float3 cov = computeCov2D( - p_orig, focal_x, focal_y, tan_fovx, tan_fovy, cov3D, viewmatrix - ); - // printf("ref: cov2d %d, %.2f %.2f %.2f\n", idx, cov.x, cov.y, cov.z); - - // Invert covariance (EWA algorithm) - float det = (cov.x * cov.z - cov.y * cov.y); - if (det == 0.0f) - return; - float det_inv = 1.f / det; - float3 conic = {cov.z * det_inv, -cov.y * det_inv, cov.x * det_inv}; - // printf("ref: conic %d %.2f %.2f %.2f\n", idx, conic.x, conic.y, conic.z); - - // Compute extent in screen space (by finding eigenvalues of - // 2D covariance matrix). Use extent to compute a bounding rectangle - // of screen-space tiles that this Gaussian overlaps with. Quit if - // rectangle covers 0 tiles. - float mid = 0.5f * (cov.x + cov.z); - float lambda1 = mid + sqrt(max(0.1f, mid * mid - det)); - float lambda2 = mid - sqrt(max(0.1f, mid * mid - det)); - float my_radius = ceil(3.f * sqrt(max(lambda1, lambda2))); - float2 point_image = {ndc2Pix(p_proj.x, W), ndc2Pix(p_proj.y, H)}; - // printf("ref: point_image %d %.2f %.2f \n", idx, point_image.x, - // point_image.y); - uint2 rect_min, rect_max; - // printf("grid %d %d \n", grid.x, grid.y); - getRect(point_image, my_radius, rect_min, rect_max, grid); - if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0) - // return; - - // If colors have been precomputed, use them, otherwise convert - // spherical harmonics coefficients to RGB color. - if (colors_precomp == nullptr) { - // glm::vec3 result = computeColorFromSH(idx, D, M, - // (glm::vec3*)orig_points, *cam_pos, shs, clamped); rgb[idx * C + - // 0] = result.x; rgb[idx * C + 1] = result.y; rgb[idx * C + 2] = - // result.z; - } - - // Store some useful helper data for the next steps. - depths[idx] = p_view.z; - radii[idx] = my_radius; - points_xy_image[idx] = point_image; - // Inverse 2D covariance and opacity neatly pack into one float4 - // conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[idx] }; - tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x); - int32_t tile_area = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x); - printf( - "ref rast: point %d x %.2f y %.2f z %.2f, radius %d, # tiles %d, " - "tile_min %d %d, tile_max %d %d \n", - idx, - point_image.x, - point_image.y, - depths[idx], - radii[idx], - tile_area, - rect_min.x, - rect_min.y, - rect_max.x, - rect_max.y - ); -} - -// Host version of diff_rast projection kernel -__host__ __device__ void host_project_gaussians_forward_kernel( - const int num_points, - const float3 *means3d, - const float3 *scales, - const float glob_scale, - const float4 *quats, - const float *viewmat, - const float *projmat, - const float fx, - const float fy, - const dim3 img_size, - const dim3 tile_bounds, - float *covs3d, - float2 *xys, - float *depths, - int *radii, - float3 *conics, - int32_t *num_tiles_hit -) { - // unsigned idx = cg::this_grid().thread_rank(); // idx of thread within - // grid - int idx = 0; - radii[idx] = 0; - num_tiles_hit[idx] = 0; - if (idx >= num_points) { - return; - } - - float3 p_world = means3d[idx]; - // printf("diff rast: p_world %d %.2f %.2f %.2f\n", idx, p_world.x, - // p_world.y, p_world.z); - float3 p_view; - if (clip_near_plane(p_world, viewmat, p_view)) { - // printf("%d is out of frustum z %.2f, returning\n", idx, p_view.z); - return; - } - // printf("diff rast: p_view %d %.2f %.2f %.2f\n", idx, p_view.x, p_view.y, - // p_view.z); - - // compute the projected covariance - float3 scale = scales[idx]; - float4 quat = quats[idx]; - // printf("%d scale %.2f %.2f %.2f\n", idx, scale.x, scale.y, scale.z); - // printf("%d quat %.2f %.2f %.2f %.2f\n", idx, quat.w, quat.x, quat.y, - // quat.z); - float *cur_cov3d = &(covs3d[6 * idx]); - scale_rot_to_cov3d(scale, glob_scale, quat, cur_cov3d); - - // project to 2d with ewa approximation - float tan_fovx = 0.5 * img_size.x / fx; - float tan_fovy = 0.5 * img_size.y / fy; - float3 cov2d = project_cov3d_ewa(p_world, cur_cov3d, viewmat, fx, fy, tan_fovx, tan_fovy); - //printf("diff rast: cov2d %d, %.2f %.2f %.2f\n", idx, cov2d.x, cov2d.y, cov2d.z); - - float3 conic; - float radius; - bool ok = compute_cov2d_bounds(cov2d, conic, radius); - if (!ok) - return; // zero determinant - // printf("diff rast: conic %d %.2f %.2f %.2f\n", idx, conic.x, conic.y, - // conic.z); - conics[idx] = conic; - - // compute the projected mean - float2 center = project_pix(projmat, p_world, img_size); - // printf("diff rast: point_image %d %.2f %.2f \n", idx, center.x, - // center.y); - uint2 tile_min, tile_max; - get_tile_bbox(center, radius, tile_bounds, tile_min, tile_max); - // printf("diff rast: tile_min %d %d \n", tile_min.x, tile_min.y); - int32_t tile_area = (tile_max.x - tile_min.x) * (tile_max.y - tile_min.y); - - num_tiles_hit[idx] = tile_area; - depths[idx] = p_view.z; - radii[idx] = (int)radius; - xys[idx] = center; - printf( - "diff rast: point %d x %.2f y %.2f z %.2f, radius %d, # tiles %d, " - "tile_min %d %d, tile_max %d %d \n", - idx, - center.x, - center.y, - depths[idx], - radii[idx], - tile_area, - tile_min.x, - tile_min.y, - tile_max.x, - tile_max.y - ); -} - -void compare_cov2d_forward() { - // TODO: test with more than one point and varying cov3d/viewmat - int num_points = 1; - float fx = 1; - float fy = 1; - const int W = 256; - const int H = 256 * 2; - float tan_fovx = 0.5 * W / fx; - float tan_fovy = 0.5 * H / fy; - float viewmat[] = { - 1.f, - 0.f, - 0.f, - 0.f, - 0.f, - 1.f, - 0.f, - 0.f, - 0.f, - 0.f, - 1.f, - 8.f, - 0.f, - 0.f, - 0.f, - 1.f}; - float viewmatColumnMajor[] = { - 1.0f, - 0.0f, - 0.0f, - 0.0f, - 0.0f, - 1.0f, - 0.0f, - 0.0f, - 0.0f, - 0.0f, - 1.0f, - 0.0f, - 0.0f, - 0.0f, - 8.0f, - 1.0f}; - float cov3d[]{1.f, 0.f, 0.f, 1.f, 0.f, 1.f}; - float3 mean = {1.f, 1.f, 1.f}; - - // diff rast - float3 diff_rast_cov2d = project_cov3d_ewa(mean,cov3d, viewmat, fx, fy, tan_fovx, tan_fovy); - std::cout<<"diff rast cov2d: "< -#include - -float percent_error(float exp_val, float true_val) { - return (abs(true_val - exp_val) / true_val ) * 100.f; -} - -std::vector percent_errors( - const std::vector>& values -) { - std::vector p_errors; - - for (int i = 0; i < values.size(); ++i) { - const float exp_val = values[i].first; - const float true_val = values[i].second; - const float p_error{ percent_error(exp_val, true_val) }; - p_errors.push_back(p_error); - } - - return p_errors; -} - -void print_errors( - const std::vector> &values, - const std::string &name, - int check_index, - float error_threshold // by default percent error (0.01%) -) { - const std::vector p_errors{percent_errors(values)}; - - // only print if input is unexpected - bool data_within_error{true}; - for (float p : p_errors) - if (p > error_threshold) - data_within_error = false; - - if (data_within_error) - return; - - // format cout for how we want to display data - std::cout << std::setprecision(2); - - std::cout << "Error on check: " << check_index << '\n' - << name << ":\n" - << " ours: refs:\n"; - - for (int i = 0; i < values.size(); ++i) { - const float d1 = values[i].first; - const float d2 = values[i].second; - std::cout << '[' << i << "]: " << std::scientific - << std::setw(10) << d1 << ", " - << std::setw(10) << d2 - << "\t(percent error=" << std::fixed << p_errors[i] << ")\n"; - } - std::cout << '\n'; -} \ No newline at end of file diff --git a/csrc/debug_utils.h b/csrc/debug_utils.h deleted file mode 100644 index e6409cca6..000000000 --- a/csrc/debug_utils.h +++ /dev/null @@ -1,23 +0,0 @@ -#pragma once - -#include -#include -#include - -float percent_error(float exp_val, float true_val); - -// takes a vector of float pairs, where first value is experimental, -// and second is true value -std::vector percent_errors( - const std::vector>& values -); - -// takes a vector of float pairs, where pair's first value is -// "experimental value", and second is "true value". If any pair's percent exceeds -// error threshold, the entire vector will be displayed in a nice lil format -void print_errors( - const std::vector> &values, - const std::string &name, - int check_index, - float error_threshold = 0.01f // percent error (0.01%) -); \ No newline at end of file diff --git a/csrc/main.cpp b/csrc/main.cpp deleted file mode 100644 index 83da83f72..000000000 --- a/csrc/main.cpp +++ /dev/null @@ -1,12 +0,0 @@ -#include "tgaimage.h" -int main() { - TGAColor red; - red[2] = 255; - red[3] = 255; - TGAImage image(100, 100, TGAImage::RGB); - image.set(52, 41, red); - image.flip_vertically( - ); // i want to have the origin at the left bottom corner of the image - image.write_tga_file("output.tga"); - return 0; -} diff --git a/csrc/reference/cuda_rasterizer/auxiliary.h b/csrc/reference/cuda_rasterizer/auxiliary.h deleted file mode 100644 index 84d8391cd..000000000 --- a/csrc/reference/cuda_rasterizer/auxiliary.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (C) 2023, Inria - * GRAPHDECO research group, https://team.inria.fr/graphdeco - * All rights reserved. - * - * This software is free for non-commercial, research and evaluation use - * under the terms of the LICENSE.md file. - * - * For inquiries contact george.drettakis@inria.fr - */ - -#ifndef CUDA_RASTERIZER_AUXILIARY_H_INCLUDED -#define CUDA_RASTERIZER_AUXILIARY_H_INCLUDED - -#include "config.h" -#include "stdio.h" - -#define BLOCK_SIZE (BLOCK_X * BLOCK_Y) -#define NUM_WARPS (BLOCK_SIZE/32) - -// Spherical harmonics coefficients -__device__ const float SH_C0 = 0.28209479177387814f; -__device__ const float SH_C1 = 0.4886025119029199f; -__device__ const float SH_C2[] = { - 1.0925484305920792f, - -1.0925484305920792f, - 0.31539156525252005f, - -1.0925484305920792f, - 0.5462742152960396f -}; -__device__ const float SH_C3[] = { - -0.5900435899266435f, - 2.890611442640554f, - -0.4570457994644658f, - 0.3731763325901154f, - -0.4570457994644658f, - 1.445305721320277f, - -0.5900435899266435f -}; - -__forceinline__ __host__ __device__ float ndc2Pix(float v, int S) -{ - return ((v + 1.0) * S - 1.0) * 0.5; -} - -__forceinline__ __host__ __device__ void getRect(const float2 p, int max_radius, uint2& rect_min, uint2& rect_max, dim3 grid) -{ - rect_min = { - min(grid.x, max((int)0, (int)((p.x - max_radius) / BLOCK_X))), - min(grid.y, max((int)0, (int)((p.y - max_radius) / BLOCK_Y))) - }; - rect_max = { - min(grid.x, max((int)0, (int)((p.x + max_radius + BLOCK_X - 1) / BLOCK_X))), - min(grid.y, max((int)0, (int)((p.y + max_radius + BLOCK_Y - 1) / BLOCK_Y))) - }; -} - -__forceinline__ __host__ __device__ float3 transformPoint4x3(const float3& p, const float* matrix) -{ - float3 transformed = { - matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12], - matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13], - matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14], - }; - return transformed; -} - -__forceinline__ __host__ __device__ float4 transformPoint4x4(const float3& p, const float* matrix) -{ - float4 transformed = { - matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12], - matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13], - matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14], - matrix[3] * p.x + matrix[7] * p.y + matrix[11] * p.z + matrix[15] - }; - return transformed; -} - -__forceinline__ __device__ float3 transformVec4x3(const float3& p, const float* matrix) -{ - float3 transformed = { - matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z, - matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z, - matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z, - }; - return transformed; -} - -__forceinline__ __device__ float3 transformVec4x3Transpose(const float3& p, const float* matrix) -{ - float3 transformed = { - matrix[0] * p.x + matrix[1] * p.y + matrix[2] * p.z, - matrix[4] * p.x + matrix[5] * p.y + matrix[6] * p.z, - matrix[8] * p.x + matrix[9] * p.y + matrix[10] * p.z, - }; - return transformed; -} - -__forceinline__ __device__ float dnormvdz(float3 v, float3 dv) -{ - float sum2 = v.x * v.x + v.y * v.y + v.z * v.z; - float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2); - float dnormvdz = (-v.x * v.z * dv.x - v.y * v.z * dv.y + (sum2 - v.z * v.z) * dv.z) * invsum32; - return dnormvdz; -} - -__forceinline__ __device__ float3 dnormvdv(float3 v, float3 dv) -{ - float sum2 = v.x * v.x + v.y * v.y + v.z * v.z; - float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2); - - float3 dnormvdv; - dnormvdv.x = ((+sum2 - v.x * v.x) * dv.x - v.y * v.x * dv.y - v.z * v.x * dv.z) * invsum32; - dnormvdv.y = (-v.x * v.y * dv.x + (sum2 - v.y * v.y) * dv.y - v.z * v.y * dv.z) * invsum32; - dnormvdv.z = (-v.x * v.z * dv.x - v.y * v.z * dv.y + (sum2 - v.z * v.z) * dv.z) * invsum32; - return dnormvdv; -} - -__forceinline__ __device__ float4 dnormvdv(float4 v, float4 dv) -{ - float sum2 = v.x * v.x + v.y * v.y + v.z * v.z + v.w * v.w; - float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2); - - float4 vdv = { v.x * dv.x, v.y * dv.y, v.z * dv.z, v.w * dv.w }; - float vdv_sum = vdv.x + vdv.y + vdv.z + vdv.w; - float4 dnormvdv; - dnormvdv.x = ((sum2 - v.x * v.x) * dv.x - v.x * (vdv_sum - vdv.x)) * invsum32; - dnormvdv.y = ((sum2 - v.y * v.y) * dv.y - v.y * (vdv_sum - vdv.y)) * invsum32; - dnormvdv.z = ((sum2 - v.z * v.z) * dv.z - v.z * (vdv_sum - vdv.z)) * invsum32; - dnormvdv.w = ((sum2 - v.w * v.w) * dv.w - v.w * (vdv_sum - vdv.w)) * invsum32; - return dnormvdv; -} - -__forceinline__ __device__ float sigmoid(float x) -{ - return 1.0f / (1.0f + expf(-x)); -} - -__forceinline__ __host__ __device__ bool in_frustum(int idx, - const float* orig_points, - const float* viewmatrix, - const float* projmatrix, - bool prefiltered, - float3& p_view) -{ - float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] }; - - // Bring points to screen space - float4 p_hom = transformPoint4x4(p_orig, projmatrix); - float p_w = 1.0f / (p_hom.w + 0.0000001f); - float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; - p_view = transformPoint4x3(p_orig, viewmatrix); - - if (p_view.z <= 0.2f)// || ((p_proj.x < -1.3 || p_proj.x > 1.3 || p_proj.y < -1.3 || p_proj.y > 1.3))) - { - if (prefiltered) - { - printf("Point is filtered although prefiltered is set. This shouldn't happen!"); - //__trap(); - } - return false; - } - return true; -} - -#define CHECK_CUDA(A, debug) \ -A; if(debug) { \ -auto ret = cudaDeviceSynchronize(); \ -if (ret != cudaSuccess) { \ -std::cerr << "\n[CUDA ERROR] in " << __FILE__ << "\nLine " << __LINE__ << ": " << cudaGetErrorString(ret); \ -throw std::runtime_error(cudaGetErrorString(ret)); \ -} \ -} - -#endif \ No newline at end of file diff --git a/csrc/reference/cuda_rasterizer/backward.cu b/csrc/reference/cuda_rasterizer/backward.cu deleted file mode 100644 index 4aa41e1cb..000000000 --- a/csrc/reference/cuda_rasterizer/backward.cu +++ /dev/null @@ -1,657 +0,0 @@ -/* - * Copyright (C) 2023, Inria - * GRAPHDECO research group, https://team.inria.fr/graphdeco - * All rights reserved. - * - * This software is free for non-commercial, research and evaluation use - * under the terms of the LICENSE.md file. - * - * For inquiries contact george.drettakis@inria.fr - */ - -#include "backward.h" -#include "auxiliary.h" -#include -#include -namespace cg = cooperative_groups; - -// Backward pass for conversion of spherical harmonics to RGB for -// each Gaussian. -__device__ void computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, const bool* clamped, const glm::vec3* dL_dcolor, glm::vec3* dL_dmeans, glm::vec3* dL_dshs) -{ - // Compute intermediate values, as it is done during forward - glm::vec3 pos = means[idx]; - glm::vec3 dir_orig = pos - campos; - glm::vec3 dir = dir_orig / glm::length(dir_orig); - - glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; - - // Use PyTorch rule for clamping: if clamping was applied, - // gradient becomes 0. - glm::vec3 dL_dRGB = dL_dcolor[idx]; - dL_dRGB.x *= clamped[3 * idx + 0] ? 0 : 1; - dL_dRGB.y *= clamped[3 * idx + 1] ? 0 : 1; - dL_dRGB.z *= clamped[3 * idx + 2] ? 0 : 1; - - glm::vec3 dRGBdx(0, 0, 0); - glm::vec3 dRGBdy(0, 0, 0); - glm::vec3 dRGBdz(0, 0, 0); - float x = dir.x; - float y = dir.y; - float z = dir.z; - - // Target location for this Gaussian to write SH gradients to - glm::vec3* dL_dsh = dL_dshs + idx * max_coeffs; - - // No tricks here, just high school-level calculus. - float dRGBdsh0 = SH_C0; - dL_dsh[0] = dRGBdsh0 * dL_dRGB; - if (deg > 0) - { - float dRGBdsh1 = -SH_C1 * y; - float dRGBdsh2 = SH_C1 * z; - float dRGBdsh3 = -SH_C1 * x; - dL_dsh[1] = dRGBdsh1 * dL_dRGB; - dL_dsh[2] = dRGBdsh2 * dL_dRGB; - dL_dsh[3] = dRGBdsh3 * dL_dRGB; - - dRGBdx = -SH_C1 * sh[3]; - dRGBdy = -SH_C1 * sh[1]; - dRGBdz = SH_C1 * sh[2]; - - if (deg > 1) - { - float xx = x * x, yy = y * y, zz = z * z; - float xy = x * y, yz = y * z, xz = x * z; - - float dRGBdsh4 = SH_C2[0] * xy; - float dRGBdsh5 = SH_C2[1] * yz; - float dRGBdsh6 = SH_C2[2] * (2.f * zz - xx - yy); - float dRGBdsh7 = SH_C2[3] * xz; - float dRGBdsh8 = SH_C2[4] * (xx - yy); - dL_dsh[4] = dRGBdsh4 * dL_dRGB; - dL_dsh[5] = dRGBdsh5 * dL_dRGB; - dL_dsh[6] = dRGBdsh6 * dL_dRGB; - dL_dsh[7] = dRGBdsh7 * dL_dRGB; - dL_dsh[8] = dRGBdsh8 * dL_dRGB; - - dRGBdx += SH_C2[0] * y * sh[4] + SH_C2[2] * 2.f * -x * sh[6] + SH_C2[3] * z * sh[7] + SH_C2[4] * 2.f * x * sh[8]; - dRGBdy += SH_C2[0] * x * sh[4] + SH_C2[1] * z * sh[5] + SH_C2[2] * 2.f * -y * sh[6] + SH_C2[4] * 2.f * -y * sh[8]; - dRGBdz += SH_C2[1] * y * sh[5] + SH_C2[2] * 2.f * 2.f * z * sh[6] + SH_C2[3] * x * sh[7]; - - if (deg > 2) - { - float dRGBdsh9 = SH_C3[0] * y * (3.f * xx - yy); - float dRGBdsh10 = SH_C3[1] * xy * z; - float dRGBdsh11 = SH_C3[2] * y * (4.f * zz - xx - yy); - float dRGBdsh12 = SH_C3[3] * z * (2.f * zz - 3.f * xx - 3.f * yy); - float dRGBdsh13 = SH_C3[4] * x * (4.f * zz - xx - yy); - float dRGBdsh14 = SH_C3[5] * z * (xx - yy); - float dRGBdsh15 = SH_C3[6] * x * (xx - 3.f * yy); - dL_dsh[9] = dRGBdsh9 * dL_dRGB; - dL_dsh[10] = dRGBdsh10 * dL_dRGB; - dL_dsh[11] = dRGBdsh11 * dL_dRGB; - dL_dsh[12] = dRGBdsh12 * dL_dRGB; - dL_dsh[13] = dRGBdsh13 * dL_dRGB; - dL_dsh[14] = dRGBdsh14 * dL_dRGB; - dL_dsh[15] = dRGBdsh15 * dL_dRGB; - - dRGBdx += ( - SH_C3[0] * sh[9] * 3.f * 2.f * xy + - SH_C3[1] * sh[10] * yz + - SH_C3[2] * sh[11] * -2.f * xy + - SH_C3[3] * sh[12] * -3.f * 2.f * xz + - SH_C3[4] * sh[13] * (-3.f * xx + 4.f * zz - yy) + - SH_C3[5] * sh[14] * 2.f * xz + - SH_C3[6] * sh[15] * 3.f * (xx - yy)); - - dRGBdy += ( - SH_C3[0] * sh[9] * 3.f * (xx - yy) + - SH_C3[1] * sh[10] * xz + - SH_C3[2] * sh[11] * (-3.f * yy + 4.f * zz - xx) + - SH_C3[3] * sh[12] * -3.f * 2.f * yz + - SH_C3[4] * sh[13] * -2.f * xy + - SH_C3[5] * sh[14] * -2.f * yz + - SH_C3[6] * sh[15] * -3.f * 2.f * xy); - - dRGBdz += ( - SH_C3[1] * sh[10] * xy + - SH_C3[2] * sh[11] * 4.f * 2.f * yz + - SH_C3[3] * sh[12] * 3.f * (2.f * zz - xx - yy) + - SH_C3[4] * sh[13] * 4.f * 2.f * xz + - SH_C3[5] * sh[14] * (xx - yy)); - } - } - } - - // The view direction is an input to the computation. View direction - // is influenced by the Gaussian's mean, so SHs gradients - // must propagate back into 3D position. - glm::vec3 dL_ddir(glm::dot(dRGBdx, dL_dRGB), glm::dot(dRGBdy, dL_dRGB), glm::dot(dRGBdz, dL_dRGB)); - - // Account for normalization of direction - float3 dL_dmean = dnormvdv(float3{ dir_orig.x, dir_orig.y, dir_orig.z }, float3{ dL_ddir.x, dL_ddir.y, dL_ddir.z }); - - // Gradients of loss w.r.t. Gaussian means, but only the portion - // that is caused because the mean affects the view-dependent color. - // Additional mean gradient is accumulated in below methods. - dL_dmeans[idx] += glm::vec3(dL_dmean.x, dL_dmean.y, dL_dmean.z); -} - -// Backward version of INVERSE 2D covariance matrix computation -// (due to length launched as separate kernel before other -// backward steps contained in preprocess) -__global__ void computeCov2DCUDA(int P, - const float3* means, - const int* radii, - const float* cov3Ds, - const float h_x, float h_y, - const float tan_fovx, float tan_fovy, - const float* view_matrix, - const float* dL_dconics, - float3* dL_dmeans, - float* dL_dcov) -{ - auto idx = cg::this_grid().thread_rank(); - if (idx >= P || !(radii[idx] > 0)) - return; - - // Reading location of 3D covariance for this Gaussian - const float* cov3D = cov3Ds + 6 * idx; - - // Fetch gradients, recompute 2D covariance and relevant - // intermediate forward results needed in the backward. - float3 mean = means[idx]; - float3 dL_dconic = { dL_dconics[4 * idx], dL_dconics[4 * idx + 1], dL_dconics[4 * idx + 3] }; - float3 t = transformPoint4x3(mean, view_matrix); - - const float limx = 1.3f * tan_fovx; - const float limy = 1.3f * tan_fovy; - const float txtz = t.x / t.z; - const float tytz = t.y / t.z; - t.x = min(limx, max(-limx, txtz)) * t.z; - t.y = min(limy, max(-limy, tytz)) * t.z; - - const float x_grad_mul = txtz < -limx || txtz > limx ? 0 : 1; - const float y_grad_mul = tytz < -limy || tytz > limy ? 0 : 1; - - glm::mat3 J = glm::mat3(h_x / t.z, 0.0f, -(h_x * t.x) / (t.z * t.z), - 0.0f, h_y / t.z, -(h_y * t.y) / (t.z * t.z), - 0, 0, 0); - - glm::mat3 W = glm::mat3( - view_matrix[0], view_matrix[4], view_matrix[8], - view_matrix[1], view_matrix[5], view_matrix[9], - view_matrix[2], view_matrix[6], view_matrix[10]); - - glm::mat3 Vrk = glm::mat3( - cov3D[0], cov3D[1], cov3D[2], - cov3D[1], cov3D[3], cov3D[4], - cov3D[2], cov3D[4], cov3D[5]); - - glm::mat3 T = W * J; - - glm::mat3 cov2D = glm::transpose(T) * glm::transpose(Vrk) * T; - - // Use helper variables for 2D covariance entries. More compact. - float a = cov2D[0][0] += 0.3f; - float b = cov2D[0][1]; - float c = cov2D[1][1] += 0.3f; - - float denom = a * c - b * b; - float dL_da = 0, dL_db = 0, dL_dc = 0; - float denom2inv = 1.0f / ((denom * denom) + 0.0000001f); - - if (denom2inv != 0) - { - // Gradients of loss w.r.t. entries of 2D covariance matrix, - // given gradients of loss w.r.t. conic matrix (inverse covariance matrix). - // e.g., dL / da = dL / d_conic_a * d_conic_a / d_a - dL_da = denom2inv * (-c * c * dL_dconic.x + 2 * b * c * dL_dconic.y + (denom - a * c) * dL_dconic.z); - dL_dc = denom2inv * (-a * a * dL_dconic.z + 2 * a * b * dL_dconic.y + (denom - a * c) * dL_dconic.x); - dL_db = denom2inv * 2 * (b * c * dL_dconic.x - (denom + 2 * b * b) * dL_dconic.y + a * b * dL_dconic.z); - - // Gradients of loss L w.r.t. each 3D covariance matrix (Vrk) entry, - // given gradients w.r.t. 2D covariance matrix (diagonal). - // cov2D = transpose(T) * transpose(Vrk) * T; - dL_dcov[6 * idx + 0] = (T[0][0] * T[0][0] * dL_da + T[0][0] * T[1][0] * dL_db + T[1][0] * T[1][0] * dL_dc); - dL_dcov[6 * idx + 3] = (T[0][1] * T[0][1] * dL_da + T[0][1] * T[1][1] * dL_db + T[1][1] * T[1][1] * dL_dc); - dL_dcov[6 * idx + 5] = (T[0][2] * T[0][2] * dL_da + T[0][2] * T[1][2] * dL_db + T[1][2] * T[1][2] * dL_dc); - - // Gradients of loss L w.r.t. each 3D covariance matrix (Vrk) entry, - // given gradients w.r.t. 2D covariance matrix (off-diagonal). - // Off-diagonal elements appear twice --> double the gradient. - // cov2D = transpose(T) * transpose(Vrk) * T; - dL_dcov[6 * idx + 1] = 2 * T[0][0] * T[0][1] * dL_da + (T[0][0] * T[1][1] + T[0][1] * T[1][0]) * dL_db + 2 * T[1][0] * T[1][1] * dL_dc; - dL_dcov[6 * idx + 2] = 2 * T[0][0] * T[0][2] * dL_da + (T[0][0] * T[1][2] + T[0][2] * T[1][0]) * dL_db + 2 * T[1][0] * T[1][2] * dL_dc; - dL_dcov[6 * idx + 4] = 2 * T[0][2] * T[0][1] * dL_da + (T[0][1] * T[1][2] + T[0][2] * T[1][1]) * dL_db + 2 * T[1][1] * T[1][2] * dL_dc; - } - else - { - for (int i = 0; i < 6; i++) - dL_dcov[6 * idx + i] = 0; - } - - // Gradients of loss w.r.t. upper 2x3 portion of intermediate matrix T - // cov2D = transpose(T) * transpose(Vrk) * T; - float dL_dT00 = 2 * (T[0][0] * Vrk[0][0] + T[0][1] * Vrk[0][1] + T[0][2] * Vrk[0][2]) * dL_da + - (T[1][0] * Vrk[0][0] + T[1][1] * Vrk[0][1] + T[1][2] * Vrk[0][2]) * dL_db; - float dL_dT01 = 2 * (T[0][0] * Vrk[1][0] + T[0][1] * Vrk[1][1] + T[0][2] * Vrk[1][2]) * dL_da + - (T[1][0] * Vrk[1][0] + T[1][1] * Vrk[1][1] + T[1][2] * Vrk[1][2]) * dL_db; - float dL_dT02 = 2 * (T[0][0] * Vrk[2][0] + T[0][1] * Vrk[2][1] + T[0][2] * Vrk[2][2]) * dL_da + - (T[1][0] * Vrk[2][0] + T[1][1] * Vrk[2][1] + T[1][2] * Vrk[2][2]) * dL_db; - float dL_dT10 = 2 * (T[1][0] * Vrk[0][0] + T[1][1] * Vrk[0][1] + T[1][2] * Vrk[0][2]) * dL_dc + - (T[0][0] * Vrk[0][0] + T[0][1] * Vrk[0][1] + T[0][2] * Vrk[0][2]) * dL_db; - float dL_dT11 = 2 * (T[1][0] * Vrk[1][0] + T[1][1] * Vrk[1][1] + T[1][2] * Vrk[1][2]) * dL_dc + - (T[0][0] * Vrk[1][0] + T[0][1] * Vrk[1][1] + T[0][2] * Vrk[1][2]) * dL_db; - float dL_dT12 = 2 * (T[1][0] * Vrk[2][0] + T[1][1] * Vrk[2][1] + T[1][2] * Vrk[2][2]) * dL_dc + - (T[0][0] * Vrk[2][0] + T[0][1] * Vrk[2][1] + T[0][2] * Vrk[2][2]) * dL_db; - - // Gradients of loss w.r.t. upper 3x2 non-zero entries of Jacobian matrix - // T = W * J - float dL_dJ00 = W[0][0] * dL_dT00 + W[0][1] * dL_dT01 + W[0][2] * dL_dT02; - float dL_dJ02 = W[2][0] * dL_dT00 + W[2][1] * dL_dT01 + W[2][2] * dL_dT02; - float dL_dJ11 = W[1][0] * dL_dT10 + W[1][1] * dL_dT11 + W[1][2] * dL_dT12; - float dL_dJ12 = W[2][0] * dL_dT10 + W[2][1] * dL_dT11 + W[2][2] * dL_dT12; - - float tz = 1.f / t.z; - float tz2 = tz * tz; - float tz3 = tz2 * tz; - - // Gradients of loss w.r.t. transformed Gaussian mean t - float dL_dtx = x_grad_mul * -h_x * tz2 * dL_dJ02; - float dL_dty = y_grad_mul * -h_y * tz2 * dL_dJ12; - float dL_dtz = -h_x * tz2 * dL_dJ00 - h_y * tz2 * dL_dJ11 + (2 * h_x * t.x) * tz3 * dL_dJ02 + (2 * h_y * t.y) * tz3 * dL_dJ12; - - // Account for transformation of mean to t - // t = transformPoint4x3(mean, view_matrix); - float3 dL_dmean = transformVec4x3Transpose({ dL_dtx, dL_dty, dL_dtz }, view_matrix); - - // Gradients of loss w.r.t. Gaussian means, but only the portion - // that is caused because the mean affects the covariance matrix. - // Additional mean gradient is accumulated in BACKWARD::preprocess. - dL_dmeans[idx] = dL_dmean; -} - -// Backward pass for the conversion of scale and rotation to a -// 3D covariance matrix for each Gaussian. -__device__ void computeCov3D(int idx, const glm::vec3 scale, float mod, const glm::vec4 rot, const float* dL_dcov3Ds, glm::vec3* dL_dscales, glm::vec4* dL_drots) -{ - // Recompute (intermediate) results for the 3D covariance computation. - glm::vec4 q = rot;// / glm::length(rot); - float r = q.x; - float x = q.y; - float y = q.z; - float z = q.w; - - glm::mat3 R = glm::mat3( - 1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y), - 2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x), - 2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y) - ); - - glm::mat3 S = glm::mat3(1.0f); - - glm::vec3 s = mod * scale; - S[0][0] = s.x; - S[1][1] = s.y; - S[2][2] = s.z; - - glm::mat3 M = S * R; - - const float* dL_dcov3D = dL_dcov3Ds + 6 * idx; - - glm::vec3 dunc(dL_dcov3D[0], dL_dcov3D[3], dL_dcov3D[5]); - glm::vec3 ounc = 0.5f * glm::vec3(dL_dcov3D[1], dL_dcov3D[2], dL_dcov3D[4]); - - // Convert per-element covariance loss gradients to matrix form - glm::mat3 dL_dSigma = glm::mat3( - dL_dcov3D[0], 0.5f * dL_dcov3D[1], 0.5f * dL_dcov3D[2], - 0.5f * dL_dcov3D[1], dL_dcov3D[3], 0.5f * dL_dcov3D[4], - 0.5f * dL_dcov3D[2], 0.5f * dL_dcov3D[4], dL_dcov3D[5] - ); - - // Compute loss gradient w.r.t. matrix M - // dSigma_dM = 2 * M - glm::mat3 dL_dM = 2.0f * M * dL_dSigma; - - glm::mat3 Rt = glm::transpose(R); - glm::mat3 dL_dMt = glm::transpose(dL_dM); - - // Gradients of loss w.r.t. scale - glm::vec3* dL_dscale = dL_dscales + idx; - dL_dscale->x = glm::dot(Rt[0], dL_dMt[0]); - dL_dscale->y = glm::dot(Rt[1], dL_dMt[1]); - dL_dscale->z = glm::dot(Rt[2], dL_dMt[2]); - - dL_dMt[0] *= s.x; - dL_dMt[1] *= s.y; - dL_dMt[2] *= s.z; - - // Gradients of loss w.r.t. normalized quaternion - glm::vec4 dL_dq; - dL_dq.x = 2 * z * (dL_dMt[0][1] - dL_dMt[1][0]) + 2 * y * (dL_dMt[2][0] - dL_dMt[0][2]) + 2 * x * (dL_dMt[1][2] - dL_dMt[2][1]); - dL_dq.y = 2 * y * (dL_dMt[1][0] + dL_dMt[0][1]) + 2 * z * (dL_dMt[2][0] + dL_dMt[0][2]) + 2 * r * (dL_dMt[1][2] - dL_dMt[2][1]) - 4 * x * (dL_dMt[2][2] + dL_dMt[1][1]); - dL_dq.z = 2 * x * (dL_dMt[1][0] + dL_dMt[0][1]) + 2 * r * (dL_dMt[2][0] - dL_dMt[0][2]) + 2 * z * (dL_dMt[1][2] + dL_dMt[2][1]) - 4 * y * (dL_dMt[2][2] + dL_dMt[0][0]); - dL_dq.w = 2 * r * (dL_dMt[0][1] - dL_dMt[1][0]) + 2 * x * (dL_dMt[2][0] + dL_dMt[0][2]) + 2 * y * (dL_dMt[1][2] + dL_dMt[2][1]) - 4 * z * (dL_dMt[1][1] + dL_dMt[0][0]); - - // Gradients of loss w.r.t. unnormalized quaternion - float4* dL_drot = (float4*)(dL_drots + idx); - *dL_drot = float4{ dL_dq.x, dL_dq.y, dL_dq.z, dL_dq.w };//dnormvdv(float4{ rot.x, rot.y, rot.z, rot.w }, float4{ dL_dq.x, dL_dq.y, dL_dq.z, dL_dq.w }); -} - -// Backward pass of the preprocessing steps, except -// for the covariance computation and inversion -// (those are handled by a previous kernel call) -template -__global__ void preprocessCUDA( - int P, int D, int M, - const float3* means, - const int* radii, - const float* shs, - const bool* clamped, - const glm::vec3* scales, - const glm::vec4* rotations, - const float scale_modifier, - const float* proj, - const glm::vec3* campos, - const float3* dL_dmean2D, - glm::vec3* dL_dmeans, - float* dL_dcolor, - float* dL_dcov3D, - float* dL_dsh, - glm::vec3* dL_dscale, - glm::vec4* dL_drot) -{ - auto idx = cg::this_grid().thread_rank(); - if (idx >= P || !(radii[idx] > 0)) - return; - - float3 m = means[idx]; - - // Taking care of gradients from the screenspace points - float4 m_hom = transformPoint4x4(m, proj); - float m_w = 1.0f / (m_hom.w + 0.0000001f); - - // Compute loss gradient w.r.t. 3D means due to gradients of 2D means - // from rendering procedure - glm::vec3 dL_dmean; - float mul1 = (proj[0] * m.x + proj[4] * m.y + proj[8] * m.z + proj[12]) * m_w * m_w; - float mul2 = (proj[1] * m.x + proj[5] * m.y + proj[9] * m.z + proj[13]) * m_w * m_w; - dL_dmean.x = (proj[0] * m_w - proj[3] * mul1) * dL_dmean2D[idx].x + (proj[1] * m_w - proj[3] * mul2) * dL_dmean2D[idx].y; - dL_dmean.y = (proj[4] * m_w - proj[7] * mul1) * dL_dmean2D[idx].x + (proj[5] * m_w - proj[7] * mul2) * dL_dmean2D[idx].y; - dL_dmean.z = (proj[8] * m_w - proj[11] * mul1) * dL_dmean2D[idx].x + (proj[9] * m_w - proj[11] * mul2) * dL_dmean2D[idx].y; - - // That's the second part of the mean gradient. Previous computation - // of cov2D and following SH conversion also affects it. - dL_dmeans[idx] += dL_dmean; - - // Compute gradient updates due to computing colors from SHs - if (shs) - computeColorFromSH(idx, D, M, (glm::vec3*)means, *campos, shs, clamped, (glm::vec3*)dL_dcolor, (glm::vec3*)dL_dmeans, (glm::vec3*)dL_dsh); - - // Compute gradient updates due to computing covariance from scale/rotation - if (scales) - computeCov3D(idx, scales[idx], scale_modifier, rotations[idx], dL_dcov3D, dL_dscale, dL_drot); -} - -// Backward version of the rendering procedure. -template -__global__ void __launch_bounds__(BLOCK_X * BLOCK_Y) -renderCUDA( - const uint2* __restrict__ ranges, - const uint32_t* __restrict__ point_list, - int W, int H, - const float* __restrict__ bg_color, - const float2* __restrict__ points_xy_image, - const float4* __restrict__ conic_opacity, - const float* __restrict__ colors, - const float* __restrict__ final_Ts, - const uint32_t* __restrict__ n_contrib, - const float* __restrict__ dL_dpixels, - float3* __restrict__ dL_dmean2D, - float4* __restrict__ dL_dconic2D, - float* __restrict__ dL_dopacity, - float* __restrict__ dL_dcolors) -{ - // We rasterize again. Compute necessary block info. - auto block = cg::this_thread_block(); - const uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X; - const uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y }; - const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) }; - const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y }; - const uint32_t pix_id = W * pix.y + pix.x; - const float2 pixf = { (float)pix.x, (float)pix.y }; - - const bool inside = pix.x < W&& pix.y < H; - const uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x]; - - const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE); - - bool done = !inside; - int toDo = range.y - range.x; - - __shared__ int collected_id[BLOCK_SIZE]; - __shared__ float2 collected_xy[BLOCK_SIZE]; - __shared__ float4 collected_conic_opacity[BLOCK_SIZE]; - __shared__ float collected_colors[C * BLOCK_SIZE]; - - // In the forward, we stored the final value for T, the - // product of all (1 - alpha) factors. - const float T_final = inside ? final_Ts[pix_id] : 0; - float T = T_final; - - // We start from the back. The ID of the last contributing - // Gaussian is known from each pixel from the forward. - uint32_t contributor = toDo; - const int last_contributor = inside ? n_contrib[pix_id] : 0; - - float accum_rec[C] = { 0 }; - float dL_dpixel[C]; - if (inside) - for (int i = 0; i < C; i++) - dL_dpixel[i] = dL_dpixels[i * H * W + pix_id]; - - float last_alpha = 0; - float last_color[C] = { 0 }; - - // Gradient of pixel coordinate w.r.t. normalized - // screen-space viewport corrdinates (-1 to 1) - const float ddelx_dx = 0.5 * W; - const float ddely_dy = 0.5 * H; - - // Traverse all Gaussians - for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE) - { - // Load auxiliary data into shared memory, start in the BACK - // and load them in revers order. - block.sync(); - const int progress = i * BLOCK_SIZE + block.thread_rank(); - if (range.x + progress < range.y) - { - const int coll_id = point_list[range.y - progress - 1]; - collected_id[block.thread_rank()] = coll_id; - collected_xy[block.thread_rank()] = points_xy_image[coll_id]; - collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id]; - for (int i = 0; i < C; i++) - collected_colors[i * BLOCK_SIZE + block.thread_rank()] = colors[coll_id * C + i]; - } - block.sync(); - - // Iterate over Gaussians - for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++) - { - // Keep track of current Gaussian ID. Skip, if this one - // is behind the last contributor for this pixel. - contributor--; - if (contributor >= last_contributor) - continue; - - // Compute blending values, as before. - const float2 xy = collected_xy[j]; - const float2 d = { xy.x - pixf.x, xy.y - pixf.y }; - const float4 con_o = collected_conic_opacity[j]; - const float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y; - if (power > 0.0f) - continue; - - const float G = exp(power); - const float alpha = min(0.99f, con_o.w * G); - if (alpha < 1.0f / 255.0f) - continue; - - T = T / (1.f - alpha); - const float dchannel_dcolor = alpha * T; - - // Propagate gradients to per-Gaussian colors and keep - // gradients w.r.t. alpha (blending factor for a Gaussian/pixel - // pair). - float dL_dalpha = 0.0f; - const int global_id = collected_id[j]; - for (int ch = 0; ch < C; ch++) - { - const float c = collected_colors[ch * BLOCK_SIZE + j]; - // Update last color (to be used in the next iteration) - accum_rec[ch] = last_alpha * last_color[ch] + (1.f - last_alpha) * accum_rec[ch]; - last_color[ch] = c; - - const float dL_dchannel = dL_dpixel[ch]; - dL_dalpha += (c - accum_rec[ch]) * dL_dchannel; - // Update the gradients w.r.t. color of the Gaussian. - // Atomic, since this pixel is just one of potentially - // many that were affected by this Gaussian. - atomicAdd(&(dL_dcolors[global_id * C + ch]), dchannel_dcolor * dL_dchannel); - } - dL_dalpha *= T; - // Update last alpha (to be used in the next iteration) - last_alpha = alpha; - - // Account for fact that alpha also influences how much of - // the background color is added if nothing left to blend - float bg_dot_dpixel = 0; - for (int i = 0; i < C; i++) - bg_dot_dpixel += bg_color[i] * dL_dpixel[i]; - dL_dalpha += (-T_final / (1.f - alpha)) * bg_dot_dpixel; - - - // Helpful reusable temporary variables - const float dL_dG = con_o.w * dL_dalpha; - const float gdx = G * d.x; - const float gdy = G * d.y; - const float dG_ddelx = -gdx * con_o.x - gdy * con_o.y; - const float dG_ddely = -gdy * con_o.z - gdx * con_o.y; - - // Update gradients w.r.t. 2D mean position of the Gaussian - atomicAdd(&dL_dmean2D[global_id].x, dL_dG * dG_ddelx * ddelx_dx); - atomicAdd(&dL_dmean2D[global_id].y, dL_dG * dG_ddely * ddely_dy); - - // Update gradients w.r.t. 2D covariance (2x2 matrix, symmetric) - atomicAdd(&dL_dconic2D[global_id].x, -0.5f * gdx * d.x * dL_dG); - atomicAdd(&dL_dconic2D[global_id].y, -0.5f * gdx * d.y * dL_dG); - atomicAdd(&dL_dconic2D[global_id].w, -0.5f * gdy * d.y * dL_dG); - - // Update gradients w.r.t. opacity of the Gaussian - atomicAdd(&(dL_dopacity[global_id]), G * dL_dalpha); - } - } -} - -void BACKWARD::preprocess( - int P, int D, int M, - const float3* means3D, - const int* radii, - const float* shs, - const bool* clamped, - const glm::vec3* scales, - const glm::vec4* rotations, - const float scale_modifier, - const float* cov3Ds, - const float* viewmatrix, - const float* projmatrix, - const float focal_x, float focal_y, - const float tan_fovx, float tan_fovy, - const glm::vec3* campos, - const float3* dL_dmean2D, - const float* dL_dconic, - glm::vec3* dL_dmean3D, - float* dL_dcolor, - float* dL_dcov3D, - float* dL_dsh, - glm::vec3* dL_dscale, - glm::vec4* dL_drot) -{ - // Propagate gradients for the path of 2D conic matrix computation. - // Somewhat long, thus it is its own kernel rather than being part of - // "preprocess". When done, loss gradient w.r.t. 3D means has been - // modified and gradient w.r.t. 3D covariance matrix has been computed. - computeCov2DCUDA << <(P + 255) / 256, 256 >> > ( - P, - means3D, - radii, - cov3Ds, - focal_x, - focal_y, - tan_fovx, - tan_fovy, - viewmatrix, - dL_dconic, - (float3*)dL_dmean3D, - dL_dcov3D); - - // Propagate gradients for remaining steps: finish 3D mean gradients, - // propagate color gradients to SH (if desireD), propagate 3D covariance - // matrix gradients to scale and rotation. - preprocessCUDA << < (P + 255) / 256, 256 >> > ( - P, D, M, - (float3*)means3D, - radii, - shs, - clamped, - (glm::vec3*)scales, - (glm::vec4*)rotations, - scale_modifier, - projmatrix, - campos, - (float3*)dL_dmean2D, - (glm::vec3*)dL_dmean3D, - dL_dcolor, - dL_dcov3D, - dL_dsh, - dL_dscale, - dL_drot); -} - -void BACKWARD::render( - const dim3 grid, const dim3 block, - const uint2* ranges, - const uint32_t* point_list, - int W, int H, - const float* bg_color, - const float2* means2D, - const float4* conic_opacity, - const float* colors, - const float* final_Ts, - const uint32_t* n_contrib, - const float* dL_dpixels, - float3* dL_dmean2D, - float4* dL_dconic2D, - float* dL_dopacity, - float* dL_dcolors) -{ - renderCUDA << > >( - ranges, - point_list, - W, H, - bg_color, - means2D, - conic_opacity, - colors, - final_Ts, - n_contrib, - dL_dpixels, - dL_dmean2D, - dL_dconic2D, - dL_dopacity, - dL_dcolors - ); -} \ No newline at end of file diff --git a/csrc/reference/cuda_rasterizer/backward.h b/csrc/reference/cuda_rasterizer/backward.h deleted file mode 100644 index 93dd2e4be..000000000 --- a/csrc/reference/cuda_rasterizer/backward.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (C) 2023, Inria - * GRAPHDECO research group, https://team.inria.fr/graphdeco - * All rights reserved. - * - * This software is free for non-commercial, research and evaluation use - * under the terms of the LICENSE.md file. - * - * For inquiries contact george.drettakis@inria.fr - */ - -#ifndef CUDA_RASTERIZER_BACKWARD_H_INCLUDED -#define CUDA_RASTERIZER_BACKWARD_H_INCLUDED - -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" -#define GLM_FORCE_CUDA -#include - -namespace BACKWARD -{ - void render( - const dim3 grid, dim3 block, - const uint2* ranges, - const uint32_t* point_list, - int W, int H, - const float* bg_color, - const float2* means2D, - const float4* conic_opacity, - const float* colors, - const float* final_Ts, - const uint32_t* n_contrib, - const float* dL_dpixels, - float3* dL_dmean2D, - float4* dL_dconic2D, - float* dL_dopacity, - float* dL_dcolors); - - void preprocess( - int P, int D, int M, - const float3* means, - const int* radii, - const float* shs, - const bool* clamped, - const glm::vec3* scales, - const glm::vec4* rotations, - const float scale_modifier, - const float* cov3Ds, - const float* view, - const float* proj, - const float focal_x, float focal_y, - const float tan_fovx, float tan_fovy, - const glm::vec3* campos, - const float3* dL_dmean2D, - const float* dL_dconics, - glm::vec3* dL_dmeans, - float* dL_dcolor, - float* dL_dcov3D, - float* dL_dsh, - glm::vec3* dL_dscale, - glm::vec4* dL_drot); -} - -#endif \ No newline at end of file diff --git a/csrc/reference/cuda_rasterizer/config.h b/csrc/reference/cuda_rasterizer/config.h deleted file mode 100644 index 2a912fb34..000000000 --- a/csrc/reference/cuda_rasterizer/config.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (C) 2023, Inria - * GRAPHDECO research group, https://team.inria.fr/graphdeco - * All rights reserved. - * - * This software is free for non-commercial, research and evaluation use - * under the terms of the LICENSE.md file. - * - * For inquiries contact george.drettakis@inria.fr - */ - -#ifndef CUDA_RASTERIZER_CONFIG_H_INCLUDED -#define CUDA_RASTERIZER_CONFIG_H_INCLUDED - -#define NUM_CHANNELS 3 // Default 3, RGB -#define BLOCK_X 16 -#define BLOCK_Y 16 - -#endif \ No newline at end of file diff --git a/csrc/reference/cuda_rasterizer/forward.cu b/csrc/reference/cuda_rasterizer/forward.cu deleted file mode 100644 index 1f2b1026a..000000000 --- a/csrc/reference/cuda_rasterizer/forward.cu +++ /dev/null @@ -1,477 +0,0 @@ -/* - * Copyright (C) 2023, Inria - * GRAPHDECO research group, https://team.inria.fr/graphdeco - * All rights reserved. - * - * This software is free for non-commercial, research and evaluation use - * under the terms of the LICENSE.md file. - * - * For inquiries contact george.drettakis@inria.fr - */ - -#include "forward.h" -#include "auxiliary.h" -#include -#include -namespace cg = cooperative_groups; - -// Forward method for converting the input spherical harmonics -// coefficients of each Gaussian to a simple RGB color. -__device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, bool* clamped) -{ - // The implementation is loosely based on code for - // "Differentiable Point-Based Radiance Fields for - // Efficient View Synthesis" by Zhang et al. (2022) - glm::vec3 pos = means[idx]; - glm::vec3 dir = pos - campos; - dir = dir / glm::length(dir); - - glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs; - glm::vec3 result = SH_C0 * sh[0]; - - if (deg > 0) - { - float x = dir.x; - float y = dir.y; - float z = dir.z; - result = result - SH_C1 * y * sh[1] + SH_C1 * z * sh[2] - SH_C1 * x * sh[3]; - - if (deg > 1) - { - float xx = x * x, yy = y * y, zz = z * z; - float xy = x * y, yz = y * z, xz = x * z; - result = result + - SH_C2[0] * xy * sh[4] + - SH_C2[1] * yz * sh[5] + - SH_C2[2] * (2.0f * zz - xx - yy) * sh[6] + - SH_C2[3] * xz * sh[7] + - SH_C2[4] * (xx - yy) * sh[8]; - - if (deg > 2) - { - result = result + - SH_C3[0] * y * (3.0f * xx - yy) * sh[9] + - SH_C3[1] * xy * z * sh[10] + - SH_C3[2] * y * (4.0f * zz - xx - yy) * sh[11] + - SH_C3[3] * z * (2.0f * zz - 3.0f * xx - 3.0f * yy) * sh[12] + - SH_C3[4] * x * (4.0f * zz - xx - yy) * sh[13] + - SH_C3[5] * z * (xx - yy) * sh[14] + - SH_C3[6] * x * (xx - 3.0f * yy) * sh[15]; - } - } - } - result += 0.5f; - - // RGB colors are clamped to positive values. If values are - // clamped, we need to keep track of this for the backward pass. - clamped[3 * idx + 0] = (result.x < 0); - clamped[3 * idx + 1] = (result.y < 0); - clamped[3 * idx + 2] = (result.z < 0); - return glm::max(result, 0.0f); -} - -// Forward version of 2D covariance matrix computation -__host__ __device__ float3 computeCov2D(const float3& mean, float focal_x, float focal_y, float tan_fovx, float tan_fovy, const float* cov3D, const float* viewmatrix) -{ - // The following models the steps outlined by equations 29 - // and 31 in "EWA Splatting" (Zwicker et al., 2002). - // Additionally considers aspect / scaling of viewport. - // Transposes used to account for row-/column-major conventions. - float3 t = transformPoint4x3(mean, viewmatrix); - // printf("ref %f %f %f\n", t.x, t.y, t.z); - - const float limx = 1.3f * tan_fovx; - const float limy = 1.3f * tan_fovy; - const float txtz = t.x / t.z; - const float tytz = t.y / t.z; - t.x = min(limx, max(-limx, txtz)) * t.z; - t.y = min(limy, max(-limy, tytz)) * t.z; - // printf("ref clipped %f %f %f\n", t.x, t.y, t.z); - - glm::mat3 J = glm::mat3( - focal_x / t.z, 0.0f, -(focal_x * t.x) / (t.z * t.z), - 0.0f, focal_y / t.z, -(focal_y * t.y) / (t.z * t.z), - 0, 0, 0); - // printf("ref J\n %f %f %f\n %f %f %f\n", - // J[0][0], J[1][0], J[2][0], - // J[0][1], J[1][1], J[2][1] - // ); - - glm::mat3 W = glm::mat3( - viewmatrix[0], viewmatrix[4], viewmatrix[8], - viewmatrix[1], viewmatrix[5], viewmatrix[9], - viewmatrix[2], viewmatrix[6], viewmatrix[10]); - - glm::mat3 T = W * J; - - // printf("ref T\n %f %f %f\n %f %f %f\n %f %f %f\n", - // T[0][0], T[1][0], T[2][0], - // T[0][1], T[1][1], T[2][1], - // T[0][2], T[1][2], T[2][2] - // ); - - glm::mat3 Vrk = glm::mat3( - cov3D[0], cov3D[1], cov3D[2], - cov3D[1], cov3D[3], cov3D[4], - cov3D[2], cov3D[4], cov3D[5]); - - // printf("ref Vrk\n %f %f %f\n %f %f %f\n %f %f %f\n", - // Vrk[0][0], Vrk[1][0], Vrk[2][0], - // Vrk[0][1], Vrk[1][1], Vrk[2][1], - // Vrk[0][2], Vrk[1][2], Vrk[2][2] - // ); - glm::mat3 cov = glm::transpose(T) * glm::transpose(Vrk) * T; - // printf("ref cov\n %f %f %f\n %f %f %f\n %f %f %f\n", - // cov[0][0], cov[1][0], cov[2][0], - // cov[0][1], cov[1][1], cov[2][1], - // cov[0][2], cov[1][2], cov[2][2] - // ); - - // Apply low-pass filter: every Gaussian should be at least - // one pixel wide/high. Discard 3rd row and column. - cov[0][0] += 0.3f; - cov[1][1] += 0.3f; - return { float(cov[0][0]), float(cov[0][1]), float(cov[1][1]) }; -} - -// Forward method for converting scale and rotation properties of each -// Gaussian to a 3D covariance matrix in world space. Also takes care -// of quaternion normalization. -__host__ __device__ void computeCov3D(const glm::vec3 scale, float mod, const glm::vec4 rot, float* cov3D) -{ - // Create scaling matrix - glm::mat3 S = glm::mat3(1.0f); - S[0][0] = mod * scale.x; - S[1][1] = mod * scale.y; - S[2][2] = mod * scale.z; - - // Normalize quaternion to get valid rotation - glm::vec4 q = rot;// / glm::length(rot); - float r = q.x; - float x = q.y; - float y = q.z; - float z = q.w; - - // Compute rotation matrix from quaternion - glm::mat3 R = glm::mat3( - 1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y), - 2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x), - 2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y) - ); - - glm::mat3 M = S * R; - - // Compute 3D world covariance matrix Sigma - glm::mat3 Sigma = glm::transpose(M) * M; - - // Covariance is symmetric, only store upper right - cov3D[0] = Sigma[0][0]; - cov3D[1] = Sigma[0][1]; - cov3D[2] = Sigma[0][2]; - cov3D[3] = Sigma[1][1]; - cov3D[4] = Sigma[1][2]; - cov3D[5] = Sigma[2][2]; -} - -// Perform initial steps for each Gaussian prior to rasterization. -template -__global__ void preprocessCUDA(int P, int D, int M, - const float* orig_points, - const glm::vec3* scales, - const float scale_modifier, - const glm::vec4* rotations, - const float* opacities, - const float* shs, - bool* clamped, - const float* cov3D_precomp, - const float* colors_precomp, - const float* viewmatrix, - const float* projmatrix, - const glm::vec3* cam_pos, - const int W, int H, - const float tan_fovx, float tan_fovy, - const float focal_x, float focal_y, - int* radii, - float2* points_xy_image, - float* depths, - float* cov3Ds, - float* rgb, - float4* conic_opacity, - const dim3 grid, - uint32_t* tiles_touched, - bool prefiltered) -{ - auto idx = cg::this_grid().thread_rank(); - if (idx >= P) - return; - - // Initialize radius and touched tiles to 0. If this isn't changed, - // this Gaussian will not be processed further. - radii[idx] = 0; - tiles_touched[idx] = 0; - - // Perform near culling, quit if outside. - float3 p_view; - if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view)) - return; - - // Transform point by projecting - float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] }; - float4 p_hom = transformPoint4x4(p_orig, projmatrix); - float p_w = 1.0f / (p_hom.w + 0.0000001f); - float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w }; - - // If 3D covariance matrix is precomputed, use it, otherwise compute - // from scaling and rotation parameters. - const float* cov3D; - if (cov3D_precomp != nullptr) - { - cov3D = cov3D_precomp + idx * 6; - } - else - { - computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6); - cov3D = cov3Ds + idx * 6; - } - - // Compute 2D screen-space covariance matrix - float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx, tan_fovy, cov3D, viewmatrix); - - // Invert covariance (EWA algorithm) - float det = (cov.x * cov.z - cov.y * cov.y); - if (det == 0.0f) - return; - float det_inv = 1.f / det; - float3 conic = { cov.z * det_inv, -cov.y * det_inv, cov.x * det_inv }; - - // Compute extent in screen space (by finding eigenvalues of - // 2D covariance matrix). Use extent to compute a bounding rectangle - // of screen-space tiles that this Gaussian overlaps with. Quit if - // rectangle covers 0 tiles. - float mid = 0.5f * (cov.x + cov.z); - float lambda1 = mid + sqrt(max(0.1f, mid * mid - det)); - float lambda2 = mid - sqrt(max(0.1f, mid * mid - det)); - float my_radius = ceil(3.f * sqrt(max(lambda1, lambda2))); - float2 point_image = { ndc2Pix(p_proj.x, W), ndc2Pix(p_proj.y, H) }; - uint2 rect_min, rect_max; - getRect(point_image, my_radius, rect_min, rect_max, grid); - if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0) - return; - - // If colors have been precomputed, use them, otherwise convert - // spherical harmonics coefficients to RGB color. - if (colors_precomp == nullptr) - { - glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped); - rgb[idx * C + 0] = result.x; - rgb[idx * C + 1] = result.y; - rgb[idx * C + 2] = result.z; - } - - // Store some useful helper data for the next steps. - depths[idx] = p_view.z; - radii[idx] = my_radius; - points_xy_image[idx] = point_image; - // Inverse 2D covariance and opacity neatly pack into one float4 - conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[idx] }; - tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x); -} - -// Main rasterization method. Collaboratively works on one tile per -// block, each thread treats one pixel. Alternates between fetching -// and rasterizing data. -template -__global__ void __launch_bounds__(BLOCK_X * BLOCK_Y) -renderCUDA( - const uint2* __restrict__ ranges, - const uint32_t* __restrict__ point_list, - int W, int H, - const float2* __restrict__ points_xy_image, - const float* __restrict__ features, - const float4* __restrict__ conic_opacity, - float* __restrict__ final_T, - uint32_t* __restrict__ n_contrib, - const float* __restrict__ bg_color, - float* __restrict__ out_color) -{ - // Identify current tile and associated min/max pixel range. - auto block = cg::this_thread_block(); - uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X; - uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y }; - uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) }; - uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y }; - uint32_t pix_id = W * pix.y + pix.x; - float2 pixf = { (float)pix.x, (float)pix.y }; - - // Check if this thread is associated with a valid pixel or outside. - bool inside = pix.x < W&& pix.y < H; - // Done threads can help with fetching, but don't rasterize - bool done = !inside; - - // Load start/end range of IDs to process in bit sorted list. - uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x]; - const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE); - int toDo = range.y - range.x; - - // Allocate storage for batches of collectively fetched data. - __shared__ int collected_id[BLOCK_SIZE]; - __shared__ float2 collected_xy[BLOCK_SIZE]; - __shared__ float4 collected_conic_opacity[BLOCK_SIZE]; - - // Initialize helper variables - float T = 1.0f; - uint32_t contributor = 0; - uint32_t last_contributor = 0; - float C[CHANNELS] = { 0 }; - - // Iterate over batches until all done or range is complete - for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE) - { - // End if entire block votes that it is done rasterizing - int num_done = __syncthreads_count(done); - if (num_done == BLOCK_SIZE) - break; - - // Collectively fetch per-Gaussian data from global to shared - int progress = i * BLOCK_SIZE + block.thread_rank(); - if (range.x + progress < range.y) - { - int coll_id = point_list[range.x + progress]; - collected_id[block.thread_rank()] = coll_id; - collected_xy[block.thread_rank()] = points_xy_image[coll_id]; - collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id]; - } - block.sync(); - - // Iterate over current batch - for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++) - { - // Keep track of current position in range - contributor++; - - // Resample using conic matrix (cf. "Surface - // Splatting" by Zwicker et al., 2001) - float2 xy = collected_xy[j]; - float2 d = { xy.x - pixf.x, xy.y - pixf.y }; - float4 con_o = collected_conic_opacity[j]; - float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y; - if (power > 0.0f) - continue; - - // Eq. (2) from 3D Gaussian splatting paper. - // Obtain alpha by multiplying with Gaussian opacity - // and its exponential falloff from mean. - // Avoid numerical instabilities (see paper appendix). - float alpha = min(0.99f, con_o.w * exp(power)); - if (alpha < 1.0f / 255.0f) - continue; - float test_T = T * (1 - alpha); - if (test_T < 0.0001f) - { - done = true; - continue; - } - - // Eq. (3) from 3D Gaussian splatting paper. - for (int ch = 0; ch < CHANNELS; ch++) - C[ch] += features[collected_id[j] * CHANNELS + ch] * alpha * T; - - T = test_T; - - // Keep track of last range entry to update this - // pixel. - last_contributor = contributor; - } - } - - // All threads that treat valid pixel write out their final - // rendering data to the frame and auxiliary buffers. - if (inside) - { - final_T[pix_id] = T; - n_contrib[pix_id] = last_contributor; - for (int ch = 0; ch < CHANNELS; ch++) - out_color[ch * H * W + pix_id] = C[ch] + T * bg_color[ch]; - } -} - -void FORWARD::render( - const dim3 grid, dim3 block, - const uint2* ranges, - const uint32_t* point_list, - int W, int H, - const float2* means2D, - const float* colors, - const float4* conic_opacity, - float* final_T, - uint32_t* n_contrib, - const float* bg_color, - float* out_color) -{ - renderCUDA << > > ( - ranges, - point_list, - W, H, - means2D, - colors, - conic_opacity, - final_T, - n_contrib, - bg_color, - out_color); -} - -void FORWARD::preprocess(int P, int D, int M, - const float* means3D, - const glm::vec3* scales, - const float scale_modifier, - const glm::vec4* rotations, - const float* opacities, - const float* shs, - bool* clamped, - const float* cov3D_precomp, - const float* colors_precomp, - const float* viewmatrix, - const float* projmatrix, - const glm::vec3* cam_pos, - const int W, int H, - const float focal_x, float focal_y, - const float tan_fovx, float tan_fovy, - int* radii, - float2* means2D, - float* depths, - float* cov3Ds, - float* rgb, - float4* conic_opacity, - const dim3 grid, - uint32_t* tiles_touched, - bool prefiltered) -{ - preprocessCUDA << <(P + 255) / 256, 256 >> > ( - P, D, M, - means3D, - scales, - scale_modifier, - rotations, - opacities, - shs, - clamped, - cov3D_precomp, - colors_precomp, - viewmatrix, - projmatrix, - cam_pos, - W, H, - tan_fovx, tan_fovy, - focal_x, focal_y, - radii, - means2D, - depths, - cov3Ds, - rgb, - conic_opacity, - grid, - tiles_touched, - prefiltered - ); -} diff --git a/csrc/reference/cuda_rasterizer/forward.h b/csrc/reference/cuda_rasterizer/forward.h deleted file mode 100644 index 3c11cb917..000000000 --- a/csrc/reference/cuda_rasterizer/forward.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (C) 2023, Inria - * GRAPHDECO research group, https://team.inria.fr/graphdeco - * All rights reserved. - * - * This software is free for non-commercial, research and evaluation use - * under the terms of the LICENSE.md file. - * - * For inquiries contact george.drettakis@inria.fr - */ - -#ifndef CUDA_RASTERIZER_FORWARD_H_INCLUDED -#define CUDA_RASTERIZER_FORWARD_H_INCLUDED - -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" -#define GLM_FORCE_CUDA -#include - -namespace FORWARD -{ - // Perform initial steps for each Gaussian prior to rasterization. - void preprocess(int P, int D, int M, - const float* orig_points, - const glm::vec3* scales, - const float scale_modifier, - const glm::vec4* rotations, - const float* opacities, - const float* shs, - bool* clamped, - const float* cov3D_precomp, - const float* colors_precomp, - const float* viewmatrix, - const float* projmatrix, - const glm::vec3* cam_pos, - const int W, int H, - const float focal_x, float focal_y, - const float tan_fovx, float tan_fovy, - int* radii, - float2* points_xy_image, - float* depths, - float* cov3Ds, - float* colors, - float4* conic_opacity, - const dim3 grid, - uint32_t* tiles_touched, - bool prefiltered); - - // Main rasterization method. - void render( - const dim3 grid, dim3 block, - const uint2* ranges, - const uint32_t* point_list, - int W, int H, - const float2* points_xy_image, - const float* features, - const float4* conic_opacity, - float* final_T, - uint32_t* n_contrib, - const float* bg_color, - float* out_color); -} - - -#endif \ No newline at end of file diff --git a/csrc/reference/cuda_rasterizer/rasterizer.h b/csrc/reference/cuda_rasterizer/rasterizer.h deleted file mode 100644 index 81544ef61..000000000 --- a/csrc/reference/cuda_rasterizer/rasterizer.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (C) 2023, Inria - * GRAPHDECO research group, https://team.inria.fr/graphdeco - * All rights reserved. - * - * This software is free for non-commercial, research and evaluation use - * under the terms of the LICENSE.md file. - * - * For inquiries contact george.drettakis@inria.fr - */ - -#ifndef CUDA_RASTERIZER_H_INCLUDED -#define CUDA_RASTERIZER_H_INCLUDED - -#include -#include - -namespace CudaRasterizer -{ - class Rasterizer - { - public: - - static void markVisible( - int P, - float* means3D, - float* viewmatrix, - float* projmatrix, - bool* present); - - static int forward( - std::function geometryBuffer, - std::function binningBuffer, - std::function imageBuffer, - const int P, int D, int M, - const float* background, - const int width, int height, - const float* means3D, - const float* shs, - const float* colors_precomp, - const float* opacities, - const float* scales, - const float scale_modifier, - const float* rotations, - const float* cov3D_precomp, - const float* viewmatrix, - const float* projmatrix, - const float* cam_pos, - const float tan_fovx, float tan_fovy, - const bool prefiltered, - float* out_color, - int* radii = nullptr, - bool debug = false); - - static void backward( - const int P, int D, int M, int R, - const float* background, - const int width, int height, - const float* means3D, - const float* shs, - const float* colors_precomp, - const float* scales, - const float scale_modifier, - const float* rotations, - const float* cov3D_precomp, - const float* viewmatrix, - const float* projmatrix, - const float* campos, - const float tan_fovx, float tan_fovy, - const int* radii, - char* geom_buffer, - char* binning_buffer, - char* image_buffer, - const float* dL_dpix, - float* dL_dmean2D, - float* dL_dconic, - float* dL_dopacity, - float* dL_dcolor, - float* dL_dmean3D, - float* dL_dcov3D, - float* dL_dsh, - float* dL_dscale, - float* dL_drot, - bool debug); - }; -}; - -#endif \ No newline at end of file diff --git a/csrc/reference/cuda_rasterizer/rasterizer_impl.cu b/csrc/reference/cuda_rasterizer/rasterizer_impl.cu deleted file mode 100644 index f8782ac43..000000000 --- a/csrc/reference/cuda_rasterizer/rasterizer_impl.cu +++ /dev/null @@ -1,434 +0,0 @@ -/* - * Copyright (C) 2023, Inria - * GRAPHDECO research group, https://team.inria.fr/graphdeco - * All rights reserved. - * - * This software is free for non-commercial, research and evaluation use - * under the terms of the LICENSE.md file. - * - * For inquiries contact george.drettakis@inria.fr - */ - -#include "rasterizer_impl.h" -#include -#include -#include -#include -#include -#include "cuda_runtime.h" -#include "device_launch_parameters.h" -#include -#include -#define GLM_FORCE_CUDA -#include - -#include -#include -namespace cg = cooperative_groups; - -#include "auxiliary.h" -#include "forward.h" -#include "backward.h" - -// Helper function to find the next-highest bit of the MSB -// on the CPU. -uint32_t getHigherMsb(uint32_t n) -{ - uint32_t msb = sizeof(n) * 4; - uint32_t step = msb; - while (step > 1) - { - step /= 2; - if (n >> msb) - msb += step; - else - msb -= step; - } - if (n >> msb) - msb++; - return msb; -} - -// Wrapper method to call auxiliary coarse frustum containment test. -// Mark all Gaussians that pass it. -__global__ void checkFrustum(int P, - const float* orig_points, - const float* viewmatrix, - const float* projmatrix, - bool* present) -{ - auto idx = cg::this_grid().thread_rank(); - if (idx >= P) - return; - - float3 p_view; - present[idx] = in_frustum(idx, orig_points, viewmatrix, projmatrix, false, p_view); -} - -// Generates one key/value pair for all Gaussian / tile overlaps. -// Run once per Gaussian (1:N mapping). -__global__ void duplicateWithKeys( - int P, - const float2* points_xy, - const float* depths, - const uint32_t* offsets, - uint64_t* gaussian_keys_unsorted, - uint32_t* gaussian_values_unsorted, - int* radii, - dim3 grid) -{ - auto idx = cg::this_grid().thread_rank(); - if (idx >= P) - return; - - // Generate no key/value pair for invisible Gaussians - if (radii[idx] > 0) - { - // Find this Gaussian's offset in buffer for writing keys/values. - uint32_t off = (idx == 0) ? 0 : offsets[idx - 1]; - uint2 rect_min, rect_max; - - getRect(points_xy[idx], radii[idx], rect_min, rect_max, grid); - - // For each tile that the bounding rect overlaps, emit a - // key/value pair. The key is | tile ID | depth |, - // and the value is the ID of the Gaussian. Sorting the values - // with this key yields Gaussian IDs in a list, such that they - // are first sorted by tile and then by depth. - for (int y = rect_min.y; y < rect_max.y; y++) - { - for (int x = rect_min.x; x < rect_max.x; x++) - { - uint64_t key = y * grid.x + x; - key <<= 32; - key |= *((uint32_t*)&depths[idx]); - gaussian_keys_unsorted[off] = key; - gaussian_values_unsorted[off] = idx; - off++; - } - } - } -} - -// Check keys to see if it is at the start/end of one tile's range in -// the full sorted list. If yes, write start/end of this tile. -// Run once per instanced (duplicated) Gaussian ID. -__global__ void identifyTileRanges(int L, uint64_t* point_list_keys, uint2* ranges) -{ - auto idx = cg::this_grid().thread_rank(); - if (idx >= L) - return; - - // Read tile ID from key. Update start/end of tile range if at limit. - uint64_t key = point_list_keys[idx]; - uint32_t currtile = key >> 32; - if (idx == 0) - ranges[currtile].x = 0; - else - { - uint32_t prevtile = point_list_keys[idx - 1] >> 32; - if (currtile != prevtile) - { - ranges[prevtile].y = idx; - ranges[currtile].x = idx; - } - } - if (idx == L - 1) - ranges[currtile].y = L; -} - -// Mark Gaussians as visible/invisible, based on view frustum testing -void CudaRasterizer::Rasterizer::markVisible( - int P, - float* means3D, - float* viewmatrix, - float* projmatrix, - bool* present) -{ - checkFrustum << <(P + 255) / 256, 256 >> > ( - P, - means3D, - viewmatrix, projmatrix, - present); -} - -CudaRasterizer::GeometryState CudaRasterizer::GeometryState::fromChunk(char*& chunk, size_t P) -{ - GeometryState geom; - obtain(chunk, geom.depths, P, 128); - obtain(chunk, geom.clamped, P * 3, 128); - obtain(chunk, geom.internal_radii, P, 128); - obtain(chunk, geom.means2D, P, 128); - obtain(chunk, geom.cov3D, P * 6, 128); - obtain(chunk, geom.conic_opacity, P, 128); - obtain(chunk, geom.rgb, P * 3, 128); - obtain(chunk, geom.tiles_touched, P, 128); - cub::DeviceScan::InclusiveSum(nullptr, geom.scan_size, geom.tiles_touched, geom.tiles_touched, P); - obtain(chunk, geom.scanning_space, geom.scan_size, 128); - obtain(chunk, geom.point_offsets, P, 128); - return geom; -} - -CudaRasterizer::ImageState CudaRasterizer::ImageState::fromChunk(char*& chunk, size_t N) -{ - ImageState img; - obtain(chunk, img.accum_alpha, N, 128); - obtain(chunk, img.n_contrib, N, 128); - obtain(chunk, img.ranges, N, 128); - return img; -} - -CudaRasterizer::BinningState CudaRasterizer::BinningState::fromChunk(char*& chunk, size_t P) -{ - BinningState binning; - obtain(chunk, binning.point_list, P, 128); - obtain(chunk, binning.point_list_unsorted, P, 128); - obtain(chunk, binning.point_list_keys, P, 128); - obtain(chunk, binning.point_list_keys_unsorted, P, 128); - cub::DeviceRadixSort::SortPairs( - nullptr, binning.sorting_size, - binning.point_list_keys_unsorted, binning.point_list_keys, - binning.point_list_unsorted, binning.point_list, P); - obtain(chunk, binning.list_sorting_space, binning.sorting_size, 128); - return binning; -} - -// Forward rendering procedure for differentiable rasterization -// of Gaussians. -int CudaRasterizer::Rasterizer::forward( - std::function geometryBuffer, - std::function binningBuffer, - std::function imageBuffer, - const int P, int D, int M, - const float* background, - const int width, int height, - const float* means3D, - const float* shs, - const float* colors_precomp, - const float* opacities, - const float* scales, - const float scale_modifier, - const float* rotations, - const float* cov3D_precomp, - const float* viewmatrix, - const float* projmatrix, - const float* cam_pos, - const float tan_fovx, float tan_fovy, - const bool prefiltered, - float* out_color, - int* radii, - bool debug) -{ - const float focal_y = height / (2.0f * tan_fovy); - const float focal_x = width / (2.0f * tan_fovx); - - size_t chunk_size = required(P); - char* chunkptr = geometryBuffer(chunk_size); - GeometryState geomState = GeometryState::fromChunk(chunkptr, P); - - if (radii == nullptr) - { - radii = geomState.internal_radii; - } - - dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1); - dim3 block(BLOCK_X, BLOCK_Y, 1); - - // Dynamically resize image-based auxiliary buffers during training - size_t img_chunk_size = required(width * height); - char* img_chunkptr = imageBuffer(img_chunk_size); - ImageState imgState = ImageState::fromChunk(img_chunkptr, width * height); - - if (NUM_CHANNELS != 3 && colors_precomp == nullptr) - { - throw std::runtime_error("For non-RGB, provide precomputed Gaussian colors!"); - } - - // Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB) - CHECK_CUDA(FORWARD::preprocess( - P, D, M, - means3D, - (glm::vec3*)scales, - scale_modifier, - (glm::vec4*)rotations, - opacities, - shs, - geomState.clamped, - cov3D_precomp, - colors_precomp, - viewmatrix, projmatrix, - (glm::vec3*)cam_pos, - width, height, - focal_x, focal_y, - tan_fovx, tan_fovy, - radii, - geomState.means2D, - geomState.depths, - geomState.cov3D, - geomState.rgb, - geomState.conic_opacity, - tile_grid, - geomState.tiles_touched, - prefiltered - ), debug) - - // Compute prefix sum over full list of touched tile counts by Gaussians - // E.g., [2, 3, 0, 2, 1] -> [2, 5, 5, 7, 8] - CHECK_CUDA(cub::DeviceScan::InclusiveSum(geomState.scanning_space, geomState.scan_size, geomState.tiles_touched, geomState.point_offsets, P), debug) - - // Retrieve total number of Gaussian instances to launch and resize aux buffers - int num_rendered; - CHECK_CUDA(cudaMemcpy(&num_rendered, geomState.point_offsets + P - 1, sizeof(int), cudaMemcpyDeviceToHost), debug); - - size_t binning_chunk_size = required(num_rendered); - char* binning_chunkptr = binningBuffer(binning_chunk_size); - BinningState binningState = BinningState::fromChunk(binning_chunkptr, num_rendered); - - // For each instance to be rendered, produce adequate [ tile | depth ] key - // and corresponding dublicated Gaussian indices to be sorted - duplicateWithKeys << <(P + 255) / 256, 256 >> > ( - P, - geomState.means2D, - geomState.depths, - geomState.point_offsets, - binningState.point_list_keys_unsorted, - binningState.point_list_unsorted, - radii, - tile_grid) - CHECK_CUDA(, debug) - - int bit = getHigherMsb(tile_grid.x * tile_grid.y); - - // Sort complete list of (duplicated) Gaussian indices by keys - CHECK_CUDA(cub::DeviceRadixSort::SortPairs( - binningState.list_sorting_space, - binningState.sorting_size, - binningState.point_list_keys_unsorted, binningState.point_list_keys, - binningState.point_list_unsorted, binningState.point_list, - num_rendered, 0, 32 + bit), debug) - - CHECK_CUDA(cudaMemset(imgState.ranges, 0, tile_grid.x * tile_grid.y * sizeof(uint2)), debug); - - // Identify start and end of per-tile workloads in sorted list - if (num_rendered > 0) - identifyTileRanges << <(num_rendered + 255) / 256, 256 >> > ( - num_rendered, - binningState.point_list_keys, - imgState.ranges); - CHECK_CUDA(, debug) - - // Let each tile blend its range of Gaussians independently in parallel - const float* feature_ptr = colors_precomp != nullptr ? colors_precomp : geomState.rgb; - CHECK_CUDA(FORWARD::render( - tile_grid, block, - imgState.ranges, - binningState.point_list, - width, height, - geomState.means2D, - feature_ptr, - geomState.conic_opacity, - imgState.accum_alpha, - imgState.n_contrib, - background, - out_color), debug) - - return num_rendered; -} - -// Produce necessary gradients for optimization, corresponding -// to forward render pass -void CudaRasterizer::Rasterizer::backward( - const int P, int D, int M, int R, - const float* background, - const int width, int height, - const float* means3D, - const float* shs, - const float* colors_precomp, - const float* scales, - const float scale_modifier, - const float* rotations, - const float* cov3D_precomp, - const float* viewmatrix, - const float* projmatrix, - const float* campos, - const float tan_fovx, float tan_fovy, - const int* radii, - char* geom_buffer, - char* binning_buffer, - char* img_buffer, - const float* dL_dpix, - float* dL_dmean2D, - float* dL_dconic, - float* dL_dopacity, - float* dL_dcolor, - float* dL_dmean3D, - float* dL_dcov3D, - float* dL_dsh, - float* dL_dscale, - float* dL_drot, - bool debug) -{ - GeometryState geomState = GeometryState::fromChunk(geom_buffer, P); - BinningState binningState = BinningState::fromChunk(binning_buffer, R); - ImageState imgState = ImageState::fromChunk(img_buffer, width * height); - - if (radii == nullptr) - { - radii = geomState.internal_radii; - } - - const float focal_y = height / (2.0f * tan_fovy); - const float focal_x = width / (2.0f * tan_fovx); - - const dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1); - const dim3 block(BLOCK_X, BLOCK_Y, 1); - - // Compute loss gradients w.r.t. 2D mean position, conic matrix, - // opacity and RGB of Gaussians from per-pixel loss gradients. - // If we were given precomputed colors and not SHs, use them. - const float* color_ptr = (colors_precomp != nullptr) ? colors_precomp : geomState.rgb; - CHECK_CUDA(BACKWARD::render( - tile_grid, - block, - imgState.ranges, - binningState.point_list, - width, height, - background, - geomState.means2D, - geomState.conic_opacity, - color_ptr, - imgState.accum_alpha, - imgState.n_contrib, - dL_dpix, - (float3*)dL_dmean2D, - (float4*)dL_dconic, - dL_dopacity, - dL_dcolor), debug) - - // Take care of the rest of preprocessing. Was the precomputed covariance - // given to us or a scales/rot pair? If precomputed, pass that. If not, - // use the one we computed ourselves. - const float* cov3D_ptr = (cov3D_precomp != nullptr) ? cov3D_precomp : geomState.cov3D; - CHECK_CUDA(BACKWARD::preprocess(P, D, M, - (float3*)means3D, - radii, - shs, - geomState.clamped, - (glm::vec3*)scales, - (glm::vec4*)rotations, - scale_modifier, - cov3D_ptr, - viewmatrix, - projmatrix, - focal_x, focal_y, - tan_fovx, tan_fovy, - (glm::vec3*)campos, - (float3*)dL_dmean2D, - dL_dconic, - (glm::vec3*)dL_dmean3D, - dL_dcolor, - dL_dcov3D, - dL_dsh, - (glm::vec3*)dL_dscale, - (glm::vec4*)dL_drot), debug) -} \ No newline at end of file diff --git a/csrc/reference/cuda_rasterizer/rasterizer_impl.h b/csrc/reference/cuda_rasterizer/rasterizer_impl.h deleted file mode 100644 index bc3f0ece7..000000000 --- a/csrc/reference/cuda_rasterizer/rasterizer_impl.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (C) 2023, Inria - * GRAPHDECO research group, https://team.inria.fr/graphdeco - * All rights reserved. - * - * This software is free for non-commercial, research and evaluation use - * under the terms of the LICENSE.md file. - * - * For inquiries contact george.drettakis@inria.fr - */ - -#pragma once - -#include -#include -#include "rasterizer.h" -#include - -namespace CudaRasterizer -{ - template - static void obtain(char*& chunk, T*& ptr, std::size_t count, std::size_t alignment) - { - std::size_t offset = (reinterpret_cast(chunk) + alignment - 1) & ~(alignment - 1); - ptr = reinterpret_cast(offset); - chunk = reinterpret_cast(ptr + count); - } - - struct GeometryState - { - size_t scan_size; - float* depths; - char* scanning_space; - bool* clamped; - int* internal_radii; - float2* means2D; - float* cov3D; - float4* conic_opacity; - float* rgb; - uint32_t* point_offsets; - uint32_t* tiles_touched; - - static GeometryState fromChunk(char*& chunk, size_t P); - }; - - struct ImageState - { - uint2* ranges; - uint32_t* n_contrib; - float* accum_alpha; - - static ImageState fromChunk(char*& chunk, size_t N); - }; - - struct BinningState - { - size_t sorting_size; - uint64_t* point_list_keys_unsorted; - uint64_t* point_list_keys; - uint32_t* point_list_unsorted; - uint32_t* point_list; - char* list_sorting_space; - - static BinningState fromChunk(char*& chunk, size_t P); - }; - - template - size_t required(size_t P) - { - char* size = nullptr; - T::fromChunk(size, P); - return ((size_t)size) + 128; - } -}; \ No newline at end of file diff --git a/csrc/reference/ext.cpp b/csrc/reference/ext.cpp deleted file mode 100644 index 2c4a771d6..000000000 --- a/csrc/reference/ext.cpp +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (C) 2023, Inria - * GRAPHDECO research group, https://team.inria.fr/graphdeco - * All rights reserved. - * - * This software is free for non-commercial, research and evaluation use - * under the terms of the LICENSE.md file. - * - * For inquiries contact george.drettakis@inria.fr - */ - -#include -#include "rasterize_points.h" - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("rasterize_gaussians", &RasterizeGaussiansCUDA); - m.def("rasterize_gaussians_backward", &RasterizeGaussiansBackwardCUDA); - m.def("mark_visible", &markVisible); -} diff --git a/csrc/reference/rasterize_points.cu b/csrc/reference/rasterize_points.cu deleted file mode 100644 index fceef53c3..000000000 --- a/csrc/reference/rasterize_points.cu +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Copyright (C) 2023, Inria - * GRAPHDECO research group, https://team.inria.fr/graphdeco - * All rights reserved. - * - * This software is free for non-commercial, research and evaluation use - * under the terms of the LICENSE.md file. - * - * For inquiries contact george.drettakis@inria.fr - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cuda_rasterizer/config.h" -#include "cuda_rasterizer/rasterizer.h" - - -std::function resizeFunctional(torch::Tensor& t) { - auto lambda = [&t](size_t N) { - t.resize_({(long long)N}); - return reinterpret_cast(t.contiguous().data_ptr()); - }; - return lambda; -} - -std::tuple -RasterizeGaussiansCUDA( - const torch::Tensor& background, - const torch::Tensor& means3D, - const torch::Tensor& colors, - const torch::Tensor& opacity, - const torch::Tensor& scales, - const torch::Tensor& rotations, - const float scale_modifier, - const torch::Tensor& cov3D_precomp, - const torch::Tensor& viewmatrix, - const torch::Tensor& projmatrix, - const float tan_fovx, - const float tan_fovy, - const int image_height, - const int image_width, - const torch::Tensor& sh, - const int degree, - const torch::Tensor& campos, - const bool prefiltered, - const bool debug) -{ - if (means3D.ndimension() != 2 || means3D.size(1) != 3) { - AT_ERROR("means3D must have dimensions (num_points, 3)"); - } - - const int P = means3D.size(0); - const int H = image_height; - const int W = image_width; - - auto int_opts = means3D.options().dtype(torch::kInt32); - auto float_opts = means3D.options().dtype(torch::kFloat32); - - torch::Tensor out_color = torch::full({NUM_CHANNELS, H, W}, 0.0, float_opts); - torch::Tensor radii = torch::full({P}, 0, means3D.options().dtype(torch::kInt32)); - - torch::Device device(torch::kCUDA); - torch::TensorOptions options(torch::kByte); - torch::Tensor geomBuffer = torch::empty({0}, options.device(device)); - torch::Tensor binningBuffer = torch::empty({0}, options.device(device)); - torch::Tensor imgBuffer = torch::empty({0}, options.device(device)); - std::function geomFunc = resizeFunctional(geomBuffer); - std::function binningFunc = resizeFunctional(binningBuffer); - std::function imgFunc = resizeFunctional(imgBuffer); - - int rendered = 0; - if(P != 0) - { - int M = 0; - if(sh.size(0) != 0) - { - M = sh.size(1); - } - - rendered = CudaRasterizer::Rasterizer::forward( - geomFunc, - binningFunc, - imgFunc, - P, degree, M, - background.contiguous().data(), - W, H, - means3D.contiguous().data(), - sh.contiguous().data_ptr(), - colors.contiguous().data(), - opacity.contiguous().data(), - scales.contiguous().data_ptr(), - scale_modifier, - rotations.contiguous().data_ptr(), - cov3D_precomp.contiguous().data(), - viewmatrix.contiguous().data(), - projmatrix.contiguous().data(), - campos.contiguous().data(), - tan_fovx, - tan_fovy, - prefiltered, - out_color.contiguous().data(), - radii.contiguous().data(), - debug); - } - return std::make_tuple(rendered, out_color, radii, geomBuffer, binningBuffer, imgBuffer); -} - -std::tuple - RasterizeGaussiansBackwardCUDA( - const torch::Tensor& background, - const torch::Tensor& means3D, - const torch::Tensor& radii, - const torch::Tensor& colors, - const torch::Tensor& scales, - const torch::Tensor& rotations, - const float scale_modifier, - const torch::Tensor& cov3D_precomp, - const torch::Tensor& viewmatrix, - const torch::Tensor& projmatrix, - const float tan_fovx, - const float tan_fovy, - const torch::Tensor& dL_dout_color, - const torch::Tensor& sh, - const int degree, - const torch::Tensor& campos, - const torch::Tensor& geomBuffer, - const int R, - const torch::Tensor& binningBuffer, - const torch::Tensor& imageBuffer, - const bool debug) -{ - const int P = means3D.size(0); - const int H = dL_dout_color.size(1); - const int W = dL_dout_color.size(2); - - int M = 0; - if(sh.size(0) != 0) - { - M = sh.size(1); - } - - torch::Tensor dL_dmeans3D = torch::zeros({P, 3}, means3D.options()); - torch::Tensor dL_dmeans2D = torch::zeros({P, 3}, means3D.options()); - torch::Tensor dL_dcolors = torch::zeros({P, NUM_CHANNELS}, means3D.options()); - torch::Tensor dL_dconic = torch::zeros({P, 2, 2}, means3D.options()); - torch::Tensor dL_dopacity = torch::zeros({P, 1}, means3D.options()); - torch::Tensor dL_dcov3D = torch::zeros({P, 6}, means3D.options()); - torch::Tensor dL_dsh = torch::zeros({P, M, 3}, means3D.options()); - torch::Tensor dL_dscales = torch::zeros({P, 3}, means3D.options()); - torch::Tensor dL_drotations = torch::zeros({P, 4}, means3D.options()); - - if(P != 0) - { - CudaRasterizer::Rasterizer::backward(P, degree, M, R, - background.contiguous().data(), - W, H, - means3D.contiguous().data(), - sh.contiguous().data(), - colors.contiguous().data(), - scales.data_ptr(), - scale_modifier, - rotations.data_ptr(), - cov3D_precomp.contiguous().data(), - viewmatrix.contiguous().data(), - projmatrix.contiguous().data(), - campos.contiguous().data(), - tan_fovx, - tan_fovy, - radii.contiguous().data(), - reinterpret_cast(geomBuffer.contiguous().data_ptr()), - reinterpret_cast(binningBuffer.contiguous().data_ptr()), - reinterpret_cast(imageBuffer.contiguous().data_ptr()), - dL_dout_color.contiguous().data(), - dL_dmeans2D.contiguous().data(), - dL_dconic.contiguous().data(), - dL_dopacity.contiguous().data(), - dL_dcolors.contiguous().data(), - dL_dmeans3D.contiguous().data(), - dL_dcov3D.contiguous().data(), - dL_dsh.contiguous().data(), - dL_dscales.contiguous().data(), - dL_drotations.contiguous().data(), - debug); - } - - return std::make_tuple(dL_dmeans2D, dL_dcolors, dL_dopacity, dL_dmeans3D, dL_dcov3D, dL_dsh, dL_dscales, dL_drotations); -} - -torch::Tensor markVisible( - torch::Tensor& means3D, - torch::Tensor& viewmatrix, - torch::Tensor& projmatrix) -{ - const int P = means3D.size(0); - - torch::Tensor present = torch::full({P}, false, means3D.options().dtype(at::kBool)); - - if(P != 0) - { - CudaRasterizer::Rasterizer::markVisible(P, - means3D.contiguous().data(), - viewmatrix.contiguous().data(), - projmatrix.contiguous().data(), - present.contiguous().data()); - } - - return present; -} diff --git a/csrc/reference/rasterize_points.h b/csrc/reference/rasterize_points.h deleted file mode 100644 index 9023d994b..000000000 --- a/csrc/reference/rasterize_points.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (C) 2023, Inria - * GRAPHDECO research group, https://team.inria.fr/graphdeco - * All rights reserved. - * - * This software is free for non-commercial, research and evaluation use - * under the terms of the LICENSE.md file. - * - * For inquiries contact george.drettakis@inria.fr - */ - -#pragma once -#include -#include -#include -#include - -std::tuple -RasterizeGaussiansCUDA( - const torch::Tensor& background, - const torch::Tensor& means3D, - const torch::Tensor& colors, - const torch::Tensor& opacity, - const torch::Tensor& scales, - const torch::Tensor& rotations, - const float scale_modifier, - const torch::Tensor& cov3D_precomp, - const torch::Tensor& viewmatrix, - const torch::Tensor& projmatrix, - const float tan_fovx, - const float tan_fovy, - const int image_height, - const int image_width, - const torch::Tensor& sh, - const int degree, - const torch::Tensor& campos, - const bool prefiltered, - const bool debug); - -std::tuple - RasterizeGaussiansBackwardCUDA( - const torch::Tensor& background, - const torch::Tensor& means3D, - const torch::Tensor& radii, - const torch::Tensor& colors, - const torch::Tensor& scales, - const torch::Tensor& rotations, - const float scale_modifier, - const torch::Tensor& cov3D_precomp, - const torch::Tensor& viewmatrix, - const torch::Tensor& projmatrix, - const float tan_fovx, - const float tan_fovy, - const torch::Tensor& dL_dout_color, - const torch::Tensor& sh, - const int degree, - const torch::Tensor& campos, - const torch::Tensor& geomBuffer, - const int R, - const torch::Tensor& binningBuffer, - const torch::Tensor& imageBuffer, - const bool debug); - -torch::Tensor markVisible( - torch::Tensor& means3D, - torch::Tensor& viewmatrix, - torch::Tensor& projmatrix); \ No newline at end of file diff --git a/csrc/test_forward.cu b/csrc/test_forward.cu deleted file mode 100644 index 9fda9d11f..000000000 --- a/csrc/test_forward.cu +++ /dev/null @@ -1,404 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "config.h" -#include "forward.cuh" -#include "tgaimage.h" - -float random_float() { return (float)std::rand() / RAND_MAX; } - - -float4 random_quat() { - float u = random_float(); - float v = random_float(); - float w = random_float(); - return { - sqrt(1.f - u) * sin(2.f * (float) M_PI * v), - sqrt(1.f - u) * cos(2.f * (float) M_PI * v), - sqrt(u) * sin(2.f * (float) M_PI * w), - sqrt(u) * cos(2.f * (float) M_PI * w) - }; -} - -int main(int argc, char *argv[]) { - int num_points = 256; - int n = 16; - if (argc > 1) { - num_points = std::stoi(argv[1]); - } - printf("rendering %d points\n", num_points); - - float bd = 2.f; - if (argc > 2) { - bd = std::stof(argv[2]); - } - printf("using scene bounds %.2f\n", bd); - - int mode = 0; - if (argc > 3) { // debug - mode = std::stoi(argv[3]); - } - printf("%d mode\n", mode); - const float fov_x = M_PI / 2.f; - const int W = 512; - const int H = 512; - const float focal = 0.5 * (float)W / tan(0.5 * fov_x); - const dim3 tile_bounds = { - (W + BLOCK_X - 1) / BLOCK_X, (H + BLOCK_Y - 1) / BLOCK_Y, 1}; - const dim3 img_size = {W, H, 1}; - const dim3 block = {BLOCK_X, BLOCK_Y, 1}; - - int num_cov3d = num_points * 6; - int num_view = 16; - const int C = 3; - - float3 *means = new float3[num_points]; - float3 *scales = new float3[num_points]; - float4 *quats = new float4[num_points]; - float *rgbs = new float[C * num_points]; - float *opacities = new float[num_points]; - // world to camera matrix - float viewmat[] = { - 1.f, - 0.f, - 0.f, - 0.f, - 0.f, - 1.f, - 0.f, - 0.f, - 0.f, - 0.f, - 1.f, - 10.f, - 0.f, - 0.f, - 0.f, - 1.f}; - - // random initialization of gaussians - std::srand(std::time(nullptr)); - if (mode == 0) { - for (int i = 0; i < num_points; ++i) { - float v = (float)i - (float)num_points * 0.5f; - means[i] = {v * 0.1f, v * 0.1f, (float)i * 0.1f}; - printf("%d, %.2f %.2f %.2f\n", i, means[i].x, means[i].y, means[i].z); - scales[i] = {1.f, 0.5f, 1.f}; - for (int c = 0; c < C; ++c) { - rgbs[C * i + c] = (float)(i % C == c); - } - quats[i] = {1.f, 0.f, 0.f, 0.f}; - opacities[i] = 0.9f; - } - } else if (mode == 1) { - for (int i = 0; i < num_points; ++i) { - means[i] = { - bd * (random_float() - 0.5f), - bd * (random_float() - 0.5f), - bd * (random_float() - 0.5f) - }; - printf("%d, %.2f %.2f %.2f\n", i, means[i].x, means[i].y, means[i].z); - scales[i] = {random_float(), random_float(), random_float()}; - for (int c = 0; c < C; ++c) { - rgbs[C * i + c] = random_float(); - } - quats[i] = {1.f, 0.f, 0.f, 0.f}; - opacities[i] = 0.9f; - } - } else { - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) { - int idx = n * i + j; - float x = 2.f * ((float) j / (float) n - 0.5f); - float y = 2.f * ((float) i / (float) n - 0.5f); - // float z = 2.f * (float) idx / (float) num_points; - float z = 5.f + random_float(); - // float z = (float) idx; - means[idx] = {x, y, z}; - printf("%d, %.2f %.2f %.2f\n", idx, x, y, z); - scales[idx] = {0.5f, 0.5f, 0.5f}; - for (int c = 0; c < C; ++c) { - // rgbs[C * idx + c] = (float)(i % C == c); - rgbs[C * idx + c] = random_float(); - } - quats[idx] = {0.f, 0.f, 0.f, 1.f}; // w x y z convention - opacities[idx] = 0.9f; - } - } - } - - float3 *scales_d, *means_d; - float *rgbs_d; - float4 *quats_d; - float *viewmat_d, *opacities_d; - - cudaMalloc((void **)&scales_d, num_points * sizeof(float3)); - cudaMalloc((void **)&means_d, num_points * sizeof(float3)); - cudaMalloc((void **)&quats_d, num_points * sizeof(float4)); - cudaMalloc((void **)&rgbs_d, C * num_points * sizeof(float)); - cudaMalloc((void **)&opacities_d, num_points * sizeof(float)); - cudaMalloc((void **)&viewmat_d, num_view * sizeof(float)); - - cudaMemcpy( - scales_d, scales, num_points * sizeof(float3), cudaMemcpyHostToDevice - ); - cudaMemcpy( - means_d, means, num_points * sizeof(float3), cudaMemcpyHostToDevice - ); - cudaMemcpy( - rgbs_d, rgbs, C * num_points * sizeof(float), cudaMemcpyHostToDevice - ); - cudaMemcpy( - opacities_d, - opacities, - num_points * sizeof(float), - cudaMemcpyHostToDevice - ); - cudaMemcpy( - quats_d, quats, num_points * sizeof(float4), cudaMemcpyHostToDevice - ); - cudaMemcpy( - viewmat_d, viewmat, num_view * sizeof(float), cudaMemcpyHostToDevice - ); - - // allocate memory for outputs - float *covs3d = new float[num_cov3d]; - float2 *xy = new float2[num_points]; - float *z = new float[num_points]; - int *radii = new int[num_points]; - float3 *conics = new float3[num_points]; - int32_t *num_tiles_hit = new int32_t[num_points]; - - float *covs3d_d, *z_d; - float2 *xy_d; - float3 *conics_d; - int *radii_d; - int32_t *num_tiles_hit_d; - cudaMalloc((void **)&covs3d_d, num_cov3d * sizeof(float)); - cudaMalloc((void **)&xy_d, num_points * sizeof(float2)); - cudaMalloc((void **)&z_d, num_points * sizeof(float)); - cudaMalloc((void **)&radii_d, num_points * sizeof(int)); - cudaMalloc((void **)&conics_d, num_points * sizeof(float3)); - cudaMalloc((void **)&num_tiles_hit_d, num_points * sizeof(int32_t)); - - printf("projecting into 2d...\n"); - project_gaussians_forward_impl( - num_points, - means_d, - scales_d, - 1.f, - quats_d, - viewmat_d, - viewmat_d, - focal, - focal, - img_size, - tile_bounds, - covs3d_d, - xy_d, - z_d, - radii_d, - conics_d, - num_tiles_hit_d - ); - cudaMemcpy( - covs3d, covs3d_d, num_cov3d * sizeof(float), cudaMemcpyDeviceToHost - ); - cudaMemcpy(xy, xy_d, num_points * sizeof(float2), cudaMemcpyDeviceToHost); - cudaMemcpy(z, z_d, num_points * sizeof(float), cudaMemcpyDeviceToHost); - cudaMemcpy( - radii, radii_d, num_points * sizeof(int), cudaMemcpyDeviceToHost - ); - cudaMemcpy( - num_tiles_hit, - num_tiles_hit_d, - num_points * sizeof(int32_t), - cudaMemcpyDeviceToHost - ); - - printf("binning and sorting...\n"); - int32_t num_intersects; - int32_t *cum_tiles_hit = new int32_t[num_points]; - int32_t *cum_tiles_hit_d; - cudaMalloc((void **)&cum_tiles_hit_d, num_points * sizeof(int32_t)); - compute_cumulative_intersects( - num_points, num_tiles_hit_d, num_intersects, cum_tiles_hit_d - ); - printf("num_intersects %d\n", num_intersects); - // cudaMemcpy(cum_tiles_hit, cum_tiles_hit_d, num_points * sizeof(int32_t), - // cudaMemcpyDeviceToHost); for (int i = 0; i < num_points; ++i) { - // printf("num_tiles_hit %d, %d\n", i, num_tiles_hit[i]); - // } - - int64_t *isect_ids_sorted_d; - int32_t *gaussian_ids_sorted_d; // sorted by tile and depth - int64_t *isect_ids_sorted = new int64_t[num_intersects]; - int32_t *gaussian_ids_sorted = new int32_t[num_intersects]; - cudaMalloc((void **)&isect_ids_sorted_d, num_intersects * sizeof(int64_t)); - cudaMalloc( - (void **)&gaussian_ids_sorted_d, num_intersects * sizeof(int32_t) - ); - - int64_t *isect_ids_unsorted_d; - int32_t *gaussian_ids_unsorted_d; // sorted by tile and depth - int64_t *isect_ids_unsorted = new int64_t[num_intersects]; - int32_t *gaussian_ids_unsorted = new int32_t[num_intersects]; - cudaMalloc( - (void **)&isect_ids_unsorted_d, num_intersects * sizeof(int64_t) - ); - cudaMalloc( - (void **)&gaussian_ids_unsorted_d, num_intersects * sizeof(int32_t) - ); - - int num_tiles = tile_bounds.x * tile_bounds.y; - int2 *tile_bins_d; // start and end indices for each tile - int2 *tile_bins = new int2[num_tiles]; - cudaMalloc((void **)&tile_bins_d, num_tiles * sizeof(int2)); - - bin_and_sort_gaussians( - num_points, - num_intersects, - xy_d, - z_d, - radii_d, - cum_tiles_hit_d, - tile_bounds, - isect_ids_unsorted_d, - gaussian_ids_unsorted_d, - isect_ids_sorted_d, - gaussian_ids_sorted_d, - tile_bins_d - ); - - // cudaMemcpy( - // isect_ids_unsorted, - // isect_ids_unsorted_d, - // num_intersects * sizeof(int64_t), - // cudaMemcpyDeviceToHost - // ); - // cudaMemcpy( - // gaussian_ids_unsorted, - // gaussian_ids_unsorted_d, - // num_intersects * sizeof(int32_t), - // cudaMemcpyDeviceToHost - // ); - // cudaMemcpy( - // isect_ids_sorted, - // isect_ids_sorted_d, - // num_intersects * sizeof(int64_t), - // cudaMemcpyDeviceToHost - // ); - cudaMemcpy( - gaussian_ids_sorted, - gaussian_ids_sorted_d, - num_intersects * sizeof(int32_t), - cudaMemcpyDeviceToHost - ); - // - // for (int i = 0; i < num_intersects; ++i) { - // printf("%d unsorted isect %016lx point %03d\n", i, - // isect_ids_unsorted[i], gaussian_ids_unsorted[i]); - // } - // for (int i = 0; i < num_intersects; ++i) { - // printf("sorted isect %016lx point %03d\n", isect_ids_sorted[i], - // gaussian_ids_sorted[i]); - // } - // cudaMemcpy(tile_bins, tile_bins_d, num_tiles * sizeof(int2), - // cudaMemcpyDeviceToHost); for (int i = 0; i < num_tiles; ++i) { - // printf("tile_bins %d %d %d\t", i, tile_bins[i].x, tile_bins[i].y); - // } - // std::cout << std::endl; - - float *out_img = new float[C * W * H]; - float *out_img_d; - float *final_Ts_d; - int *final_idx_d; - cudaMalloc((void **)&out_img_d, C * W * H * sizeof(float)); - cudaMalloc((void **)&final_Ts_d, W * H * sizeof(float)); - cudaMalloc((void **)&final_idx_d, W * H * sizeof(int)); - - // const float background[3] = {1.0f, 1.0f, 1.0f}; - const float background[3] = {0.0f, 0.0f, 0.0f}; - float *background_d; - cudaMalloc((void **)&background_d, C * sizeof(float)); - cudaMemcpy( - background_d, background, C * sizeof(float), cudaMemcpyHostToDevice - ); - - printf("final rasterize pass\n"); - rasterize_forward_impl ( - tile_bounds, - block, - img_size, - C, - gaussian_ids_sorted_d, - tile_bins_d, - xy_d, - conics_d, - rgbs_d, - opacities_d, - final_Ts_d, - final_idx_d, - out_img_d, - background_d - ); - cudaMemcpy( - out_img, out_img_d, C * W * H * sizeof(float), cudaMemcpyDeviceToHost - ); - TGAImage image(W, H, TGAImage::RGB); - int idx; - float3 c; - TGAColor col; - for (int y = 0; y < H; ++y) { - for (int x = 0; x < W; ++x) { - idx = y * W + x; - for (int c = 0; c < C; ++c) { - col[c] = (uint8_t)(255.f * out_img[C * idx + c]); - } - col[3] = 255; - image.set(x, y, col); - } - } - image.write_tga_file("output.tga"); - - printf("freeing memory...\n"); - - cudaFree(scales_d); - cudaFree(quats_d); - cudaFree(rgbs_d); - cudaFree(opacities_d); - cudaFree(covs3d_d); - cudaFree(viewmat_d); - cudaFree(xy_d); - cudaFree(z_d); - cudaFree(radii_d); - cudaFree(num_tiles_hit_d); - cudaFree(cum_tiles_hit_d); - cudaFree(tile_bins_d); - cudaFree(isect_ids_unsorted_d); - cudaFree(gaussian_ids_unsorted_d); - cudaFree(isect_ids_sorted_d); - cudaFree(gaussian_ids_sorted_d); - cudaFree(conics_d); - cudaFree(out_img_d); - - delete[] scales; - delete[] means; - delete[] rgbs; - delete[] opacities; - delete[] quats; - delete[] covs3d; - delete[] xy; - delete[] z; - delete[] radii; - delete[] num_tiles_hit; - delete[] cum_tiles_hit; - delete[] tile_bins; - delete[] gaussian_ids_sorted; - delete[] out_img; - return 0; -} diff --git a/csrc/tgaimage.cpp b/csrc/tgaimage.cpp deleted file mode 100644 index d9164716f..000000000 --- a/csrc/tgaimage.cpp +++ /dev/null @@ -1,262 +0,0 @@ -#include "tgaimage.h" -#include -#include - -TGAImage::TGAImage(const int w, const int h, const int bpp) - : w(w), h(h), bpp(bpp), data(w * h * bpp, 0) {} - -bool TGAImage::read_tga_file(const std::string filename) { - std::ifstream in; - in.open(filename, std::ios::binary); - if (!in.is_open()) { - std::cerr << "can't open file " << filename << "\n"; - return false; - } - TGAHeader header; - in.read(reinterpret_cast(&header), sizeof(header)); - if (!in.good()) { - std::cerr << "an error occured while reading the header\n"; - return false; - } - w = header.width; - h = header.height; - bpp = header.bitsperpixel >> 3; - if (w <= 0 || h <= 0 || (bpp != GRAYSCALE && bpp != RGB && bpp != RGBA)) { - std::cerr << "bad bpp (or width/height) value\n"; - return false; - } - size_t nbytes = bpp * w * h; - data = std::vector(nbytes, 0); - if (3 == header.datatypecode || 2 == header.datatypecode) { - in.read(reinterpret_cast(data.data()), nbytes); - if (!in.good()) { - std::cerr << "an error occured while reading the data\n"; - return false; - } - } else if (10 == header.datatypecode || 11 == header.datatypecode) { - if (!load_rle_data(in)) { - std::cerr << "an error occured while reading the data\n"; - return false; - } - } else { - std::cerr << "unknown file format " << (int)header.datatypecode << "\n"; - return false; - } - if (!(header.imagedescriptor & 0x20)) - flip_vertically(); - if (header.imagedescriptor & 0x10) - flip_horizontally(); - std::cerr << w << "x" << h << "/" << bpp * 8 << "\n"; - return true; -} - -bool TGAImage::load_rle_data(std::ifstream &in) { - size_t pixelcount = w * h; - size_t currentpixel = 0; - size_t currentbyte = 0; - TGAColor colorbuffer; - do { - std::uint8_t chunkheader = 0; - chunkheader = in.get(); - if (!in.good()) { - std::cerr << "an error occured while reading the data\n"; - return false; - } - if (chunkheader < 128) { - chunkheader++; - for (int i = 0; i < chunkheader; i++) { - in.read(reinterpret_cast(colorbuffer.bgra), bpp); - if (!in.good()) { - std::cerr << "an error occured while reading the header\n"; - return false; - } - for (int t = 0; t < bpp; t++) - data[currentbyte++] = colorbuffer.bgra[t]; - currentpixel++; - if (currentpixel > pixelcount) { - std::cerr << "Too many pixels read\n"; - return false; - } - } - } else { - chunkheader -= 127; - in.read(reinterpret_cast(colorbuffer.bgra), bpp); - if (!in.good()) { - std::cerr << "an error occured while reading the header\n"; - return false; - } - for (int i = 0; i < chunkheader; i++) { - for (int t = 0; t < bpp; t++) - data[currentbyte++] = colorbuffer.bgra[t]; - currentpixel++; - if (currentpixel > pixelcount) { - std::cerr << "Too many pixels read\n"; - return false; - } - } - } - } while (currentpixel < pixelcount); - return true; -} - -bool TGAImage::write_tga_file( - const std::string filename, const bool vflip, const bool rle -) const { - constexpr std::uint8_t developer_area_ref[4] = {0, 0, 0, 0}; - constexpr std::uint8_t extension_area_ref[4] = {0, 0, 0, 0}; - constexpr std::uint8_t footer[18] = { - 'T', - 'R', - 'U', - 'E', - 'V', - 'I', - 'S', - 'I', - 'O', - 'N', - '-', - 'X', - 'F', - 'I', - 'L', - 'E', - '.', - '\0'}; - std::ofstream out; - out.open(filename, std::ios::binary); - if (!out.is_open()) { - std::cerr << "can't open file " << filename << "\n"; - return false; - } - TGAHeader header = {}; - header.bitsperpixel = bpp << 3; - header.width = w; - header.height = h; - header.datatypecode = (bpp == GRAYSCALE ? (rle ? 11 : 3) : (rle ? 10 : 2)); - header.imagedescriptor = - vflip ? 0x00 : 0x20; // top-left or bottom-left origin - out.write(reinterpret_cast(&header), sizeof(header)); - if (!out.good()) { - std::cerr << "can't dump the tga file\n"; - return false; - } - if (!rle) { - out.write(reinterpret_cast(data.data()), w * h * bpp); - if (!out.good()) { - std::cerr << "can't unload raw data\n"; - return false; - } - } else if (!unload_rle_data(out)) { - std::cerr << "can't unload rle data\n"; - return false; - } - out.write( - reinterpret_cast(developer_area_ref), - sizeof(developer_area_ref) - ); - if (!out.good()) { - std::cerr << "can't dump the tga file\n"; - return false; - } - out.write( - reinterpret_cast(extension_area_ref), - sizeof(extension_area_ref) - ); - if (!out.good()) { - std::cerr << "can't dump the tga file\n"; - return false; - } - out.write(reinterpret_cast(footer), sizeof(footer)); - if (!out.good()) { - std::cerr << "can't dump the tga file\n"; - return false; - } - return true; -} - -// TODO: it is not necessary to break a raw chunk for two equal pixels (for the -// matter of the resulting size) -bool TGAImage::unload_rle_data(std::ofstream &out) const { - const std::uint8_t max_chunk_length = 128; - size_t npixels = w * h; - size_t curpix = 0; - while (curpix < npixels) { - size_t chunkstart = curpix * bpp; - size_t curbyte = curpix * bpp; - std::uint8_t run_length = 1; - bool raw = true; - while (curpix + run_length < npixels && run_length < max_chunk_length) { - bool succ_eq = true; - for (int t = 0; succ_eq && t < bpp; t++) - succ_eq = (data[curbyte + t] == data[curbyte + t + bpp]); - curbyte += bpp; - if (1 == run_length) - raw = !succ_eq; - if (raw && succ_eq) { - run_length--; - break; - } - if (!raw && !succ_eq) - break; - run_length++; - } - curpix += run_length; - out.put(raw ? run_length - 1 : run_length + 127); - if (!out.good()) { - std::cerr << "can't dump the tga file\n"; - return false; - } - out.write( - reinterpret_cast(data.data() + chunkstart), - (raw ? run_length * bpp : bpp) - ); - if (!out.good()) { - std::cerr << "can't dump the tga file\n"; - return false; - } - } - return true; -} - -TGAColor TGAImage::get(const int x, const int y) const { - if (!data.size() || x < 0 || y < 0 || x >= w || y >= h) - return {}; - TGAColor ret = {0, 0, 0, 0, bpp}; - const std::uint8_t *p = data.data() + (x + y * w) * bpp; - for (int i = bpp; i--; ret.bgra[i] = p[i]) - ; - return ret; -} - -void TGAImage::set(int x, int y, const TGAColor &c) { - if (!data.size() || x < 0 || y < 0 || x >= w || y >= h) - return; - memcpy(data.data() + (x + y * w) * bpp, c.bgra, bpp); -} - -void TGAImage::flip_horizontally() { - int half = w >> 1; - for (int i = 0; i < half; i++) - for (int j = 0; j < h; j++) - for (int b = 0; b < bpp; b++) - std::swap( - data[(i + j * w) * bpp + b], - data[(w - 1 - i + j * w) * bpp + b] - ); -} - -void TGAImage::flip_vertically() { - int half = h >> 1; - for (int i = 0; i < w; i++) - for (int j = 0; j < half; j++) - for (int b = 0; b < bpp; b++) - std::swap( - data[(i + j * w) * bpp + b], - data[(i + (h - 1 - j) * w) * bpp + b] - ); -} - -int TGAImage::width() const { return w; } - -int TGAImage::height() const { return h; } diff --git a/csrc/tgaimage.h b/csrc/tgaimage.h deleted file mode 100644 index 9aa44b7e2..000000000 --- a/csrc/tgaimage.h +++ /dev/null @@ -1,55 +0,0 @@ -#pragma once -#include -#include -#include - -#pragma pack(push, 1) -struct TGAHeader { - std::uint8_t idlength = 0; - std::uint8_t colormaptype = 0; - std::uint8_t datatypecode = 0; - std::uint16_t colormaporigin = 0; - std::uint16_t colormaplength = 0; - std::uint8_t colormapdepth = 0; - std::uint16_t x_origin = 0; - std::uint16_t y_origin = 0; - std::uint16_t width = 0; - std::uint16_t height = 0; - std::uint8_t bitsperpixel = 0; - std::uint8_t imagedescriptor = 0; -}; -#pragma pack(pop) - -struct TGAColor { - std::uint8_t bgra[4] = {0, 0, 0, 0}; - std::uint8_t bytespp = 4; - std::uint8_t &operator[](const int i) { return bgra[i]; } -}; - -struct TGAImage { - enum Format { GRAYSCALE = 1, RGB = 3, RGBA = 4 }; - - TGAImage() = default; - TGAImage(const int w, const int h, const int bpp); - bool read_tga_file(const std::string filename); - bool write_tga_file( - const std::string filename, - const bool vflip = true, - const bool rle = true - ) const; - void flip_horizontally(); - void flip_vertically(); - TGAColor get(const int x, const int y) const; - void set(const int x, const int y, const TGAColor &c); - int width() const; - int height() const; - - private: - bool load_rle_data(std::ifstream &in); - bool unload_rle_data(std::ofstream &out) const; - - int w = 0; - int h = 0; - std::uint8_t bpp = 0; - std::vector data = {}; -}; diff --git a/csrc/third_party/doctest b/csrc/third_party/doctest deleted file mode 160000 index ae7a13539..000000000 --- a/csrc/third_party/doctest +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ae7a13539fb71f270b87eb2e874fbac80bc8dda2 diff --git a/diff_rast/__init__.py b/diff_rast/__init__.py new file mode 100644 index 000000000..066567ca2 --- /dev/null +++ b/diff_rast/__init__.py @@ -0,0 +1,9 @@ +from .project_gaussians import ProjectGaussians +from .rasterize import RasterizeGaussians +from .version import __version__ + +__all__ = [ + "__version__", + "ProjectGaussians", + "RasterizeGaussians", +] diff --git a/diff_rast/_torch_impl.py b/diff_rast/_torch_impl.py new file mode 100644 index 000000000..6dc3d9214 --- /dev/null +++ b/diff_rast/_torch_impl.py @@ -0,0 +1,109 @@ +"""Pure PyTorch implementation""" + +from jaxtyping import Float +import torch +from torch import Tensor + + +def compute_sh_color( + viewdirs: Float[Tensor, "*batch 3"], sh_coeffs: Float[Tensor, "*batch D C"] +): + """ + :param viewdirs (*, C) + :param sh_coeffs (*, D, C) sh coefficients for each color channel + return colors (*, C) + """ + *dims, dim_sh, C = sh_coeffs.shape + bases = eval_sh_bases(dim_sh, viewdirs) # (*, dim_sh) + return (bases[..., None] * sh_coeffs).sum(dim=-2) + + +""" +Taken from https://github.com/sxyu/svox2 +""" + +SH_C0 = 0.28209479177387814 +SH_C1 = 0.4886025119029199 +SH_C2 = [ + 1.0925484305920792, + -1.0925484305920792, + 0.31539156525252005, + -1.0925484305920792, + 0.5462742152960396, +] +SH_C3 = [ + -0.5900435899266435, + 2.890611442640554, + -0.4570457994644658, + 0.3731763325901154, + -0.4570457994644658, + 1.445305721320277, + -0.5900435899266435, +] +SH_C4 = [ + 2.5033429417967046, + -1.7701307697799304, + 0.9461746957575601, + -0.6690465435572892, + 0.10578554691520431, + -0.6690465435572892, + 0.47308734787878004, + -1.7701307697799304, + 0.6258357354491761, +] + +MAX_SH_BASIS = 10 + + +def eval_sh_bases(basis_dim: int, dirs: torch.Tensor): + """ + Evaluate spherical harmonics bases at unit directions, + without taking linear combination. + At each point, the final result may the be + obtained through simple multiplication. + + :param basis_dim: int SH basis dim. Currently, 1-25 square numbers supported + :param dirs: torch.Tensor (..., 3) unit directions + + :return: torch.Tensor (..., basis_dim) + """ + result = torch.empty( + (*dirs.shape[:-1], basis_dim), dtype=dirs.dtype, device=dirs.device + ) + result[..., 0] = SH_C0 + if basis_dim > 1: + x, y, z = dirs.unbind(-1) + result[..., 1] = -SH_C1 * y + result[..., 2] = SH_C1 * z + result[..., 3] = -SH_C1 * x + if basis_dim > 4: + xx, yy, zz = x * x, y * y, z * z + xy, yz, xz = x * y, y * z, x * z + result[..., 4] = SH_C2[0] * xy + result[..., 5] = SH_C2[1] * yz + result[..., 6] = SH_C2[2] * (2.0 * zz - xx - yy) + result[..., 7] = SH_C2[3] * xz + result[..., 8] = SH_C2[4] * (xx - yy) + + if basis_dim > 9: + result[..., 9] = SH_C3[0] * y * (3 * xx - yy) + result[..., 10] = SH_C3[1] * xy * z + result[..., 11] = SH_C3[2] * y * (4 * zz - xx - yy) + result[..., 12] = SH_C3[3] * z * (2 * zz - 3 * xx - 3 * yy) + result[..., 13] = SH_C3[4] * x * (4 * zz - xx - yy) + result[..., 14] = SH_C3[5] * z * (xx - yy) + result[..., 15] = SH_C3[6] * x * (xx - 3 * yy) + + if basis_dim > 16: + result[..., 16] = SH_C4[0] * xy * (xx - yy) + result[..., 17] = SH_C4[1] * yz * (3 * xx - yy) + result[..., 18] = SH_C4[2] * xy * (7 * zz - 1) + result[..., 19] = SH_C4[3] * yz * (7 * zz - 3) + result[..., 20] = SH_C4[4] * (zz * (35 * zz - 30) + 3) + result[..., 21] = SH_C4[5] * xz * (7 * zz - 3) + result[..., 22] = SH_C4[6] * (xx - yy) * (7 * zz - 1) + result[..., 23] = SH_C4[7] * xz * (xx - 3 * yy) + result[..., 24] = SH_C4[8] * ( + xx * (xx - 3 * yy) - yy * (3 * xx - yy) + ) + return result diff --git a/diff_rast/cuda/__init__.py b/diff_rast/cuda/__init__.py new file mode 100644 index 000000000..f995b41fe --- /dev/null +++ b/diff_rast/cuda/__init__.py @@ -0,0 +1,20 @@ +from typing import Callable + + +def _make_lazy_cuda_func(name: str) -> Callable: + def call_cuda(*args, **kwargs): + # pylint: disable=import-outside-toplevel + from ._backend import _C + + return getattr(_C, name)(*args, **kwargs) + + return call_cuda + + +rasterize_forward = _make_lazy_cuda_func("rasterize_forward") +rasterize_backward = _make_lazy_cuda_func("rasterize_backward") +compute_cov2d_bounds_forward = _make_lazy_cuda_func("compute_cov2d_bounds_forward") +project_gaussians_forward = _make_lazy_cuda_func("project_gaussians_forward") +project_gaussians_backward = _make_lazy_cuda_func("project_gaussians_backward") +compute_sh_forward = _make_lazy_cuda_func("compute_sh_forward") +compute_sh_backward = _make_lazy_cuda_func("compute_sh_backward") diff --git a/diff_rast/cuda/_backend.py b/diff_rast/cuda/_backend.py new file mode 100644 index 000000000..2d8ce35d0 --- /dev/null +++ b/diff_rast/cuda/_backend.py @@ -0,0 +1,92 @@ +import glob +import json +import os +import shutil +from subprocess import DEVNULL, call + +from rich.console import Console +from torch.utils.cpp_extension import _get_build_directory, load + +PATH = os.path.dirname(os.path.abspath(__file__)) + + +def cuda_toolkit_available(): + """Check if the nvcc is avaiable on the machine.""" + try: + call(["nvcc"], stdout=DEVNULL, stderr=DEVNULL) + return True + except FileNotFoundError: + return False + + +def cuda_toolkit_version(): + """Get the cuda toolkit version.""" + cuda_home = os.path.join(os.path.dirname(shutil.which("nvcc")), "..") + if os.path.exists(os.path.join(cuda_home, "version.txt")): + with open(os.path.join(cuda_home, "version.txt")) as f: + cuda_version = f.read().strip().split()[-1] + elif os.path.exists(os.path.join(cuda_home, "version.json")): + with open(os.path.join(cuda_home, "version.json")) as f: + cuda_version = json.load(f)["cuda"]["version"] + else: + raise RuntimeError("Cannot find the cuda version.") + return cuda_version + + +name = "diff_rast_cuda" +build_dir = _get_build_directory(name, verbose=False) +extra_include_paths = [os.path.join(PATH, "csrc/third_party/glm")] +extra_cflags = ["-O3"] +extra_cuda_cflags = ["-O3"] + +_C = None +sources = list(glob.glob(os.path.join(PATH, "csrc/*.cu"))) + list( + glob.glob(os.path.join(PATH, "csrc/*.cpp")) +) +# sources = [ +# os.path.join(PATH, "csrc/ext.cpp"), +# os.path.join(PATH, "csrc/rasterize.cu"), +# os.path.join(PATH, "csrc/bindings.cu"), +# os.path.join(PATH, "csrc/forward.cu"), +# os.path.join(PATH, "csrc/backward.cu"), +# ] + +try: + # try to import the compiled module (via setup.py) + from diff_rast import csrc as _C +except ImportError: + # if failed, try with JIT compilation + if cuda_toolkit_available(): + if os.listdir(build_dir) != []: + # If the build exists, we assume the extension has been built + # and we can load it. + + _C = load( + name=name, + sources=sources, + extra_cflags=extra_cflags, + extra_cuda_cflags=extra_cuda_cflags, + extra_include_paths=extra_include_paths, + ) + else: + # Build from scratch. Remove the build directory just to be safe: pytorch jit might stuck + # if the build directory exists. + shutil.rmtree(build_dir) + with Console().status( + "[bold yellow]DiffRast: Setting up CUDA (This may take a few minutes the first time)", + spinner="bouncingBall", + ): + _C = load( + name=name, + sources=sources, + extra_cflags=extra_cflags, + extra_cuda_cflags=extra_cuda_cflags, + extra_include_paths=extra_include_paths, + ) + else: + Console().print( + "[yellow]DiffRast: No CUDA toolkit found. DiffRast will be disabled.[/yellow]" + ) + + +__all__ = ["_C"] diff --git a/csrc/CMakeLists.txt b/diff_rast/cuda/csrc/CMakeLists.txt similarity index 100% rename from csrc/CMakeLists.txt rename to diff_rast/cuda/csrc/CMakeLists.txt diff --git a/csrc/backward.cu b/diff_rast/cuda/csrc/backward.cu similarity index 100% rename from csrc/backward.cu rename to diff_rast/cuda/csrc/backward.cu diff --git a/csrc/backward.cuh b/diff_rast/cuda/csrc/backward.cuh similarity index 100% rename from csrc/backward.cuh rename to diff_rast/cuda/csrc/backward.cuh diff --git a/csrc/bindings.cu b/diff_rast/cuda/csrc/bindings.cu similarity index 100% rename from csrc/bindings.cu rename to diff_rast/cuda/csrc/bindings.cu diff --git a/csrc/bindings.h b/diff_rast/cuda/csrc/bindings.h similarity index 100% rename from csrc/bindings.h rename to diff_rast/cuda/csrc/bindings.h diff --git a/csrc/config.h b/diff_rast/cuda/csrc/config.h similarity index 100% rename from csrc/config.h rename to diff_rast/cuda/csrc/config.h diff --git a/csrc/ext.cpp b/diff_rast/cuda/csrc/ext.cpp similarity index 100% rename from csrc/ext.cpp rename to diff_rast/cuda/csrc/ext.cpp diff --git a/csrc/forward.cu b/diff_rast/cuda/csrc/forward.cu similarity index 99% rename from csrc/forward.cu rename to diff_rast/cuda/csrc/forward.cu index 26c7d2ad6..aebf60816 100644 --- a/csrc/forward.cu +++ b/diff_rast/cuda/csrc/forward.cu @@ -522,6 +522,7 @@ __host__ __device__ float3 project_cov3d_ewa( __host__ __device__ void scale_rot_to_cov3d( const float3 scale, const float glob_scale, const float4 quat, float *cov3d ) { + // printf("quat %.2f %.2f %.2f %.2f\n", quat.x, quat.y, quat.z, quat.w); glm::mat3 R = quat_to_rotmat(quat); // printf("R %.2f %.2f %.2f\n", R[0][0], R[1][1], R[2][2]); glm::mat3 S = scale_to_mat(scale, glob_scale); diff --git a/csrc/forward.cuh b/diff_rast/cuda/csrc/forward.cuh similarity index 100% rename from csrc/forward.cuh rename to diff_rast/cuda/csrc/forward.cuh diff --git a/csrc/helpers.cuh b/diff_rast/cuda/csrc/helpers.cuh similarity index 100% rename from csrc/helpers.cuh rename to diff_rast/cuda/csrc/helpers.cuh diff --git a/csrc/rasterize.cu b/diff_rast/cuda/csrc/rasterize.cu similarity index 100% rename from csrc/rasterize.cu rename to diff_rast/cuda/csrc/rasterize.cu diff --git a/csrc/rasterize.h b/diff_rast/cuda/csrc/rasterize.h similarity index 100% rename from csrc/rasterize.h rename to diff_rast/cuda/csrc/rasterize.h diff --git a/csrc/serial_backward.cu b/diff_rast/cuda/csrc/serial_backward.cu similarity index 100% rename from csrc/serial_backward.cu rename to diff_rast/cuda/csrc/serial_backward.cu diff --git a/csrc/serial_backward.cuh b/diff_rast/cuda/csrc/serial_backward.cuh similarity index 100% rename from csrc/serial_backward.cuh rename to diff_rast/cuda/csrc/serial_backward.cuh diff --git a/csrc/sh.cuh b/diff_rast/cuda/csrc/sh.cuh similarity index 100% rename from csrc/sh.cuh rename to diff_rast/cuda/csrc/sh.cuh diff --git a/csrc/third_party/glm b/diff_rast/cuda/csrc/third_party/glm similarity index 100% rename from csrc/third_party/glm rename to diff_rast/cuda/csrc/third_party/glm diff --git a/diff_rast/diff_rast/__init__.py b/diff_rast/diff_rast/__init__.py deleted file mode 100644 index 5f42e0783..000000000 --- a/diff_rast/diff_rast/__init__.py +++ /dev/null @@ -1 +0,0 @@ -import torch \ No newline at end of file diff --git a/diff_rast/diff_rast/_torch_impl.py b/diff_rast/diff_rast/_torch_impl.py deleted file mode 100644 index 4de51a47b..000000000 --- a/diff_rast/diff_rast/_torch_impl.py +++ /dev/null @@ -1,276 +0,0 @@ -"""Pure PyTorch implementation""" - -from jaxtyping import Float -import torch -import torch.nn.functional as F -from torch import Tensor - - -def quat_to_rotmat(quat: Tensor) -> Tensor: - assert quat.shape[-1] == 4, quat.shape - x, y, z, w = torch.unbind(F.normalize(quat, dim=-1), dim=-1) - return torch.stack( - [ - torch.stack( - [ - 1 - 2 * (y**2 + z**2), - 2 * (x * y - w * z), - 2 * (x * z + w * y), - ], - dim=-1, - ), - torch.stack( - [ - 2 * (x * y + w * z), - 1 - 2 * (x**2 + z**2), - 2 * (y * z - w * x), - ], - dim=-1, - ), - torch.stack( - [ - 2 * (x * z - w * y), - 2 * (y * z + w * x), - 1 - 2 * (x**2 + y**2), - ], - dim=-1, - ), - ], - dim=-2, - ) - - -def scale_rot_to_cov3d( - scale: Tensor, glob_scale: float, quat: Tensor -) -> Tensor: - assert scale.shape[-1] == 3, scale.shape - assert quat.shape[-1] == 4, quat.shape - assert scale.shape[:-1] == quat.shape[:-1], (scale.shape, quat.shape) - R = quat_to_rotmat(quat) # (..., 3, 3) - M = R * glob_scale * scale[..., None, :] # (..., 3, 3) - # TODO: save upper right because symmetric - return M @ M.transpose(-1, -2) # (..., 3, 3) - - -def project_cov3d_ewa( - mean3d: Tensor, cov3d: Tensor, viewmat: Tensor, fx: float, fy: float -) -> Tensor: - assert mean3d.shape[-1] == 3, mean3d.shape - assert cov3d.shape[-2:] == (3, 3), cov3d.shape - assert viewmat.shape[-2:] == (4, 4), viewmat.shape - W = viewmat[..., :3, :3] # (..., 3, 3) - p = viewmat[..., :3, 3] # (..., 3) - t = torch.matmul(W, mean3d[..., None])[..., 0] + p # (..., 3) - rz = 1.0 / t[..., 2] # (...,) - rz2 = rz**2 # (...,) - J = torch.stack( - [ - torch.stack( - [fx * rz, torch.zeros_like(rz), -fx * t[..., 0] * rz2], dim=-1 - ), - torch.stack( - [torch.zeros_like(rz), fy * rz, -fy * t[..., 1] * rz2], dim=-1 - ), - ], - dim=-2, - ) # (..., 2, 3) - T = J @ W # (..., 2, 3) - cov2d = T @ cov3d @ T.transpose(-1, -2) # (..., 2, 2) - # add a little blur along axes and (TODO save upper triangular elements) - cov2d[..., 0, 0] = cov2d[..., 0, 0] + 0.1 - cov2d[..., 1, 1] = cov2d[..., 1, 1] + 0.1 - return cov2d - - -def compute_cov2d_bounds(cov2d: Tensor, eps=1e-6): - det = cov2d[..., 0, 0] * cov2d[..., 1, 1] - cov2d[..., 0, 1] ** 2 - det = torch.clamp(det, min=eps) - conic = torch.stack( - [ - cov2d[..., 1, 1] / det, - -cov2d[..., 0, 1] / det, - cov2d[..., 0, 0] / det, - ], - dim=-1, - ) # (..., 3) - b = (cov2d[..., 0, 0] + cov2d[..., 1, 1]) / 2 # (...,) - v1 = b + torch.sqrt(torch.clamp(b**2 - det, min=0.1)) # (...,) - v2 = b - torch.sqrt(torch.clamp(b**2 - det, min=0.1)) # (...,) - radius = torch.ceil(3.0 * torch.sqrt(torch.max(v1, v2))) # (...,) - return conic, radius, det > eps - - -def ndc2pix(x, W): - return 0.5 * ((x + 1.0) * W - 1.0) - - -def project_pix(mat, p, img_size, eps=1e-6): - p_hom = F.pad(p, (0, 1), value=1.0) - p_hom = torch.einsum("...ij,...j->...i", mat, p_hom) - rw = 1.0 / torch.clamp(p_hom[..., 3], min=eps) - p_proj = p_hom[..., :3] * rw[..., None] - u = ndc2pix(p_proj[..., 0], img_size[0]) - v = ndc2pix(p_proj[..., 1], img_size[1]) - return torch.stack([u, v], dim=-1) - - -def clip_near_plane(p, viewmat, thresh=0.1): - R = viewmat[..., :3, :3] - T = viewmat[..., :3, 3] - p_view = torch.matmul(R, p[..., None])[..., 0] + T - return p_view, p_view[..., 2] < thresh - - -def get_tile_bbox(pix_center, pix_radius, tile_bounds, BLOCK_X=16, BLOCK_Y=16): - tile_size = torch.tensor( - [BLOCK_X, BLOCK_Y], dtype=torch.float32, device=pix_center.device - ) - tile_center = pix_center / tile_size - tile_radius = pix_radius[..., None] / tile_size - - top_left = (tile_center - tile_radius).to(torch.int32) - bottom_right = (tile_center + tile_radius).to(torch.int32) + 1 - tile_min = torch.stack( - [ - torch.clamp(top_left[..., 0], 0, tile_bounds[0]), - torch.clamp(top_left[..., 1], 0, tile_bounds[1]), - ], - -1, - ) - tile_max = torch.stack( - [ - torch.clamp(bottom_right[..., 0], 0, tile_bounds[0]), - torch.clamp(bottom_right[..., 1], 0, tile_bounds[1]), - ], - -1, - ) - return tile_min, tile_max - - -def project_gaussians_forward( - means3d, - scales, - glob_scale, - quats, - viewmat, - projmat, - fx, - fy, - img_size, - tile_bounds, -): - p_view, is_close = clip_near_plane(means3d, viewmat) - cov3d = scale_rot_to_cov3d(scales, glob_scale, quats) - cov2d = project_cov3d_ewa(means3d, cov3d, viewmat, fx, fy) - conic, radius, det_valid = compute_cov2d_bounds(cov2d) - center = project_pix(projmat, means3d, img_size) - tile_min, tile_max = get_tile_bbox(center, radius, tile_bounds) - tile_area = (tile_max[..., 0] - tile_min[..., 0]) * ( - tile_max[..., 1] - tile_min[..., 1] - ) - mask = (tile_area > 0) & (~is_close) & det_valid - - num_tiles_hit = tile_area - depths = p_view[..., 2] - radii = radius.to(torch.int32) - xys = center - conics = conic - - return cov3d, xys, depths, radii, conics, num_tiles_hit, mask - - -def compute_sh_color(viewdirs: Float[Tensor, "*batch 3"], sh_coeffs : Float[Tensor, "*batch D C"]): - """ - :param viewdirs (*, C) - :param sh_coeffs (*, D, C) sh coefficients for each color channel - return colors (*, C) - """ - *dims, dim_sh, C = sh_coeffs.shape - bases = eval_sh_bases(dim_sh, viewdirs) # (*, dim_sh) - return (bases[..., None] * sh_coeffs).sum(dim=-2) - - -""" -Taken from https://github.com/sxyu/svox2 -""" - -SH_C0 = 0.28209479177387814 -SH_C1 = 0.4886025119029199 -SH_C2 = [ - 1.0925484305920792, - -1.0925484305920792, - 0.31539156525252005, - -1.0925484305920792, - 0.5462742152960396 -] -SH_C3 = [ - -0.5900435899266435, - 2.890611442640554, - -0.4570457994644658, - 0.3731763325901154, - -0.4570457994644658, - 1.445305721320277, - -0.5900435899266435 -] -SH_C4 = [ - 2.5033429417967046, - -1.7701307697799304, - 0.9461746957575601, - -0.6690465435572892, - 0.10578554691520431, - -0.6690465435572892, - 0.47308734787878004, - -1.7701307697799304, - 0.6258357354491761, -] - -MAX_SH_BASIS = 10 - -def eval_sh_bases(basis_dim : int, dirs : torch.Tensor): - """ - Evaluate spherical harmonics bases at unit directions, - without taking linear combination. - At each point, the final result may the be - obtained through simple multiplication. - - :param basis_dim: int SH basis dim. Currently, 1-25 square numbers supported - :param dirs: torch.Tensor (..., 3) unit directions - - :return: torch.Tensor (..., basis_dim) - """ - result = torch.empty((*dirs.shape[:-1], basis_dim), dtype=dirs.dtype, device=dirs.device) - result[..., 0] = SH_C0 - if basis_dim > 1: - x, y, z = dirs.unbind(-1) - result[..., 1] = -SH_C1 * y; - result[..., 2] = SH_C1 * z; - result[..., 3] = -SH_C1 * x; - if basis_dim > 4: - xx, yy, zz = x * x, y * y, z * z - xy, yz, xz = x * y, y * z, x * z - result[..., 4] = SH_C2[0] * xy; - result[..., 5] = SH_C2[1] * yz; - result[..., 6] = SH_C2[2] * (2.0 * zz - xx - yy); - result[..., 7] = SH_C2[3] * xz; - result[..., 8] = SH_C2[4] * (xx - yy); - - if basis_dim > 9: - result[..., 9] = SH_C3[0] * y * (3 * xx - yy); - result[..., 10] = SH_C3[1] * xy * z; - result[..., 11] = SH_C3[2] * y * (4 * zz - xx - yy); - result[..., 12] = SH_C3[3] * z * (2 * zz - 3 * xx - 3 * yy); - result[..., 13] = SH_C3[4] * x * (4 * zz - xx - yy); - result[..., 14] = SH_C3[5] * z * (xx - yy); - result[..., 15] = SH_C3[6] * x * (xx - 3 * yy); - - if basis_dim > 16: - result[..., 16] = SH_C4[0] * xy * (xx - yy); - result[..., 17] = SH_C4[1] * yz * (3 * xx - yy); - result[..., 18] = SH_C4[2] * xy * (7 * zz - 1); - result[..., 19] = SH_C4[3] * yz * (7 * zz - 3); - result[..., 20] = SH_C4[4] * (zz * (35 * zz - 30) + 3); - result[..., 21] = SH_C4[5] * xz * (7 * zz - 3); - result[..., 22] = SH_C4[6] * (xx - yy) * (7 * zz - 1); - result[..., 23] = SH_C4[7] * xz * (xx - 3 * yy); - result[..., 24] = SH_C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy)); - return result diff --git a/diff_rast/diff_rast/project_gaussians.py b/diff_rast/project_gaussians.py similarity index 92% rename from diff_rast/diff_rast/project_gaussians.py rename to diff_rast/project_gaussians.py index 47153df6b..88ed6c327 100644 --- a/diff_rast/diff_rast/project_gaussians.py +++ b/diff_rast/project_gaussians.py @@ -6,7 +6,7 @@ from torch import Tensor from torch.autograd import Function -import cuda_lib # make sure to import torch before diff_rast +import diff_rast.cuda as _C class ProjectGaussians(Function): @@ -50,7 +50,7 @@ def forward( radii, conics, num_tiles_hit, - ) = cuda_lib.project_gaussians_forward( + ) = _C.project_gaussians_forward( num_points, means3d, scales, @@ -100,13 +100,7 @@ def backward(ctx, v_xys, v_depths, v_radii, v_conics, v_num_tiles_hit, v_cov3d): conics, ) = ctx.saved_tensors - ( - v_cov2d, - v_cov3d, - v_mean3d, - v_scale, - v_quat, - ) = cuda_lib.project_gaussians_backward( + (v_cov2d, v_cov3d, v_mean3d, v_scale, v_quat,) = _C.project_gaussians_backward( ctx.num_points, means3d, scales, diff --git a/diff_rast/diff_rast/rasterize.py b/diff_rast/rasterize.py similarity index 95% rename from diff_rast/diff_rast/rasterize.py rename to diff_rast/rasterize.py index 536b83368..e219c4d02 100644 --- a/diff_rast/diff_rast/rasterize.py +++ b/diff_rast/rasterize.py @@ -7,7 +7,7 @@ from torch import Tensor from torch.autograd import Function -import cuda_lib # make sure to import torch before diff_rast +import diff_rast.cuda as _C class RasterizeGaussians(Function): @@ -59,7 +59,7 @@ def forward( gaussian_ids_unsorted, isect_ids_sorted, isect_ids_unsorted, - ) = cuda_lib.rasterize_forward( + ) = _C.rasterize_forward( xys.contiguous().cuda(), depths.contiguous().cuda(), radii.contiguous().cuda(), @@ -105,7 +105,7 @@ def backward(ctx, v_out_img): final_idx, ) = ctx.saved_tensors - v_xy, v_conic, v_colors, v_opacity = cuda_lib.rasterize_backward( + v_xy, v_conic, v_colors, v_opacity = _C.rasterize_backward( img_height, img_width, gaussian_ids_sorted.contiguous().cuda(), @@ -132,5 +132,3 @@ def backward(ctx, v_out_img): None, # img_width None, # background ) - - diff --git a/diff_rast/setup.py b/diff_rast/setup.py deleted file mode 100644 index 78d204ed8..000000000 --- a/diff_rast/setup.py +++ /dev/null @@ -1,28 +0,0 @@ -import os - -from setuptools import setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -CSRC = os.path.abspath(f"{__file__}/../../csrc") - -setup( - name="diff_rast", - description=" Python package for differentiable rasterization of gaussians", - keywords="gaussian, splatting, cuda", - # package_dir = {'': '.'}, - packages=['diff_rast'], - ext_modules=[ - CUDAExtension( - name="cuda_lib", - sources=[ - f"{CSRC}/ext.cpp", - f"{CSRC}/rasterize.cu", - f"{CSRC}/bindings.cu", - f"{CSRC}/forward.cu", - f"{CSRC}/backward.cu", - ], - extra_compile_args={"nvcc": [f"-I {CSRC}/third_party/glm/"]}, - ), - ], - cmdclass={"build_ext": BuildExtension}, -) diff --git a/diff_rast/diff_rast/sh.py b/diff_rast/sh.py similarity index 82% rename from diff_rast/diff_rast/sh.py rename to diff_rast/sh.py index f97f60c0c..d70498f9d 100644 --- a/diff_rast/diff_rast/sh.py +++ b/diff_rast/sh.py @@ -1,7 +1,6 @@ """Python bindings for SH""" -import torch -import cuda_lib +import diff_rast.cuda as _C from jaxtyping import Float from torch import Tensor @@ -21,13 +20,14 @@ def num_sh_bases(degree: int): class SphericalHarmonics(Function): - """Compute spherical harmonics - + """Compute spherical harmonics + Args: degree (int): degree of SHs. viewdirs (Tensor): viewing directions. coeffs (Tensor): harmonic coefficients. """ + @staticmethod def forward( ctx, @@ -39,7 +39,7 @@ def forward( assert coeffs.shape[-2] == num_sh_bases(degree) ctx.degree = degree ctx.save_for_backward(viewdirs) - return cuda_lib.compute_sh_forward(num_points, degree, viewdirs, coeffs) + return _C.compute_sh_forward(num_points, degree, viewdirs, coeffs) @staticmethod def backward(ctx, v_colors: Float[Tensor, "*batch 3"]): @@ -49,5 +49,5 @@ def backward(ctx, v_colors: Float[Tensor, "*batch 3"]): return ( None, None, - cuda_lib.compute_sh_backward(num_points, degree, viewdirs, v_colors), + _C.compute_sh_backward(num_points, degree, viewdirs, v_colors), ) diff --git a/diff_rast/version.py b/diff_rast/version.py new file mode 100644 index 000000000..6c8e6b979 --- /dev/null +++ b/diff_rast/version.py @@ -0,0 +1 @@ +__version__ = "0.0.0" diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 000000000..41c270bb3 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 000000000..49a09e602 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,4 @@ +furo +sphinx +sphinx-copybutton +sphinx-design \ No newline at end of file diff --git a/docs/source/apis/proj.rst b/docs/source/apis/proj.rst new file mode 100644 index 000000000..b8c08b21b --- /dev/null +++ b/docs/source/apis/proj.rst @@ -0,0 +1,7 @@ +Projection +=================================== + +.. currentmodule:: diff_rast + +.. autoclass:: ProjectGaussians + :members: \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 000000000..7b682372c --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,42 @@ +__version__ = None +exec(open("../../diff_rast/version.py", "r").read()) + +# -- Project information + +project = "diff_rast" +copyright = "2023, Vickie" +author = "Vickie" + +release = __version__ + +# -- General configuration + +extensions = [ + "sphinx.ext.napoleon", + "sphinx.ext.duration", + "sphinx.ext.doctest", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.intersphinx", +] + +intersphinx_mapping = { + "python": ("https://docs.python.org/3/", None), + "sphinx": ("https://www.sphinx-doc.org/en/master/", None), +} +intersphinx_disabled_domains = ["std"] + +templates_path = ["_templates"] + +# -- Options for HTML output +html_theme = "furo" + +# Ignore >>> when copying code +copybutton_prompt_text = r">>> |\.\.\. " +copybutton_prompt_is_regexp = True + +# -- Options for EPUB output +epub_show_urls = "footnote" + +# typehints +autodoc_typehints = "description" diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 000000000..6c2b31bbf --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,13 @@ +DiffRast Documentation +=================================== + + +Links: +------------- + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Python API + + apis/* \ No newline at end of file diff --git a/tests/compare_reference.py b/examples/compare_reference.py similarity index 98% rename from tests/compare_reference.py rename to examples/compare_reference.py index ebb548997..d2e777bb9 100644 --- a/tests/compare_reference.py +++ b/examples/compare_reference.py @@ -113,7 +113,7 @@ def _init_gaussians(self): self.viewmat.requires_grad = False def forward_ours(self): - xys, depths, radii, conics, num_tiles_hit,cov3d = ProjectGaussians.apply( + xys, depths, radii, conics, num_tiles_hit, cov3d = ProjectGaussians.apply( self.means, self.scales, self.scale_mod, diff --git a/tests/ref_trainer.py b/examples/ref_trainer.py similarity index 99% rename from tests/ref_trainer.py rename to examples/ref_trainer.py index af1aade65..779f3d42c 100644 --- a/tests/ref_trainer.py +++ b/examples/ref_trainer.py @@ -27,7 +27,7 @@ def projection_matrix(znear, zfar, fovx, fovy, **kwargs): [0.0, 0.0, (f + n) / (f - n), -2.0 * f * n / (f - n)], [0.0, 0.0, 1.0, 0.0], ], - **kwargs + **kwargs, ) @@ -164,7 +164,6 @@ def forward(self): ) # (C, H, W) return out_color.permute(1, 2, 0) - def train(self, iterations: int = 1000, lr: float = 0.01, save_imgs: bool = True): optimizer = optim.Adam( [self.rgbs, self.means, self.scales, self.opacities, self.quats], lr diff --git a/tests/simple_trainer.py b/examples/simple_trainer.py similarity index 97% rename from tests/simple_trainer.py rename to examples/simple_trainer.py index 046b0bd33..5dd36fb7d 100644 --- a/tests/simple_trainer.py +++ b/examples/simple_trainer.py @@ -44,7 +44,7 @@ def _init_gaussians(self): self.scales = torch.empty((self.num_points, 3), device=self.device) self.quats = torch.empty((self.num_points, 4), device=self.device) self.rgbs = torch.ones((self.num_points, 3), device=self.device) - self.opacities = torch.ones((self.num_points,1), device=self.device) + self.opacities = torch.ones((self.num_points, 1), device=self.device) bd = 2 for i in range(self.num_points): self.means[i] = torch.tensor( @@ -144,13 +144,16 @@ def train(self, iterations: int = 1000, lr: float = 0.01, save_imgs: bool = True loop=0, ) + def image_path_to_tensor(image_path): import torchvision.transforms as transforms + img = Image.open(image_path) transform = transforms.ToTensor() - img_tensor = transform(img).permute(1,2,0) + img_tensor = transform(img).permute(1, 2, 0) return img_tensor + def main(height: int = 256, width: int = 256) -> None: gt_image = torch.ones((height, width, 3)) * 1.0 # make top left and bottom right red,blue diff --git a/tests/test_sh.py b/examples/test_sh.py similarity index 100% rename from tests/test_sh.py rename to examples/test_sh.py diff --git a/ref_rast/ref_rast/__init__.py b/ref_rast/ref_rast/__init__.py deleted file mode 100644 index f05b919e0..000000000 --- a/ref_rast/ref_rast/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -from .rasterize import _RasterizeGaussians, GaussianRasterizationSettings - -def rasterize_gaussians( - means3D, - means2D, - sh, - colors_precomp, - opacities, - scales, - rotations, - cov3Ds_precomp, - raster_settings, -): - return _RasterizeGaussians.apply( - means3D, - means2D, - sh, - colors_precomp, - opacities, - scales, - rotations, - cov3Ds_precomp, - raster_settings, - ) diff --git a/ref_rast/ref_rast/rasterize.py b/ref_rast/ref_rast/rasterize.py deleted file mode 100644 index 0892c6739..000000000 --- a/ref_rast/ref_rast/rasterize.py +++ /dev/null @@ -1,366 +0,0 @@ -# -# Copyright (C) 2023, Inria -# GRAPHDECO research group, https://team.inria.fr/graphdeco -# All rights reserved. -# -# This software is free for non-commercial, research and evaluation use -# under the terms of the LICENSE.md file. -# -# For inquiries contact george.drettakis@inria.fr -# - -from typing import NamedTuple -import torch.nn as nn -import torch -import ref_rast_cuda - - -def cpu_deep_copy_tuple(input_tuple): - copied_tensors = [ - item.cpu().clone() if isinstance(item, torch.Tensor) else item - for item in input_tuple - ] - return tuple(copied_tensors) - - -def rasterize_gaussians( - means3D, - means2D, - sh, - colors_precomp, - opacities, - scales, - rotations, - cov3Ds_precomp, - raster_settings, -): - return _RasterizeGaussians.apply( - means3D, - means2D, - sh, - colors_precomp, - opacities, - scales, - rotations, - cov3Ds_precomp, - raster_settings, - ) - - -class _RasterizeGaussians(torch.autograd.Function): - @staticmethod - def forward( - ctx, - means3D, - means2D, - sh, - colors_precomp, - opacities, - scales, - rotations, - cov3Ds_precomp, - raster_settings, - ): - # Restructure arguments the way that the C++ lib expects them - args = ( - raster_settings.bg, - means3D, - colors_precomp, - opacities, - scales, - rotations, - raster_settings.scale_modifier, - cov3Ds_precomp, - raster_settings.viewmatrix, - raster_settings.projmatrix, - raster_settings.tanfovx, - raster_settings.tanfovy, - raster_settings.image_height, - raster_settings.image_width, - sh, - raster_settings.sh_degree, - raster_settings.campos, - raster_settings.prefiltered, - raster_settings.debug, - ) - - # Invoke C++/CUDA rasterizer - if raster_settings.debug: - cpu_args = cpu_deep_copy_tuple( - args - ) # Copy them before they can be corrupted - try: - ( - num_rendered, - color, - radii, - geomBuffer, - binningBuffer, - imgBuffer, - ) = ref_rast_cuda.rasterize_gaussians(*args) - except Exception as ex: - torch.save(cpu_args, "snapshot_fw.dump") - print( - "\nAn error occured in forward. Please forward snapshot_fw.dump for debugging." - ) - raise ex - else: - ( - num_rendered, - color, - radii, - geomBuffer, - binningBuffer, - imgBuffer, - ) = ref_rast_cuda.rasterize_gaussians(*args) - - # Keep relevant tensors for backward - ctx.raster_settings = raster_settings - ctx.num_rendered = num_rendered - ctx.save_for_backward( - colors_precomp, - means3D, - scales, - rotations, - cov3Ds_precomp, - radii, - sh, - geomBuffer, - binningBuffer, - imgBuffer, - ) - return color, radii - - @staticmethod - def backward(ctx, grad_out_color, _): - # Restore necessary values from context - num_rendered = ctx.num_rendered - raster_settings = ctx.raster_settings - ( - colors_precomp, - means3D, - scales, - rotations, - cov3Ds_precomp, - radii, - sh, - geomBuffer, - binningBuffer, - imgBuffer, - ) = ctx.saved_tensors - - # Restructure args as C++ method expects them - args = ( - raster_settings.bg, - means3D, - radii, - colors_precomp, - scales, - rotations, - raster_settings.scale_modifier, - cov3Ds_precomp, - raster_settings.viewmatrix, - raster_settings.projmatrix, - raster_settings.tanfovx, - raster_settings.tanfovy, - grad_out_color, - sh, - raster_settings.sh_degree, - raster_settings.campos, - geomBuffer, - num_rendered, - binningBuffer, - imgBuffer, - raster_settings.debug, - ) - - # Compute gradients for relevant tensors by invoking backward method - if raster_settings.debug: - cpu_args = cpu_deep_copy_tuple( - args - ) # Copy them before they can be corrupted - try: - ( - grad_means2D, - grad_colors_precomp, - grad_opacities, - grad_means3D, - grad_cov3Ds_precomp, - grad_sh, - grad_scales, - grad_rotations, - ) = ref_rast_cuda.rasterize_gaussians_backward(*args) - except Exception as ex: - torch.save(cpu_args, "snapshot_bw.dump") - print( - "\nAn error occured in backward. Writing snapshot_bw.dump for debugging.\n" - ) - raise ex - else: - ( - grad_means2D, - grad_colors_precomp, - grad_opacities, - grad_means3D, - grad_cov3Ds_precomp, - grad_sh, - grad_scales, - grad_rotations, - ) = ref_rast_cuda.rasterize_gaussians_backward(*args) - - grads = ( - grad_means3D, - grad_means2D, - grad_sh, - grad_colors_precomp, - grad_opacities, - grad_scales, - grad_rotations, - grad_cov3Ds_precomp, - None, - ) - - return grads - - -class GaussianRasterizationSettings(NamedTuple): - image_height: int - image_width: int - tanfovx: float - tanfovy: float - bg: torch.Tensor - scale_modifier: float - viewmatrix: torch.Tensor - projmatrix: torch.Tensor - sh_degree: int - campos: torch.Tensor - prefiltered: bool - debug: bool - - -class GaussianRasterizer(nn.Module): - def __init__(self, raster_settings): - super().__init__() - self.raster_settings = raster_settings - - def markVisible(self, positions): - # Mark visible points (based on frustum culling for camera) with a boolean - with torch.no_grad(): - raster_settings = self.raster_settings - visible = ref_rast_cuda.mark_visible( - positions, raster_settings.viewmatrix, raster_settings.projmatrix - ) - - return visible - - def forward( - self, - means3D, - means2D, - opacities, - shs=None, - colors_precomp=None, - scales=None, - rotations=None, - cov3D_precomp=None, - ): - raster_settings = self.raster_settings - - if (shs is None and colors_precomp is None) or ( - shs is not None and colors_precomp is not None - ): - raise Exception( - "Please provide excatly one of either SHs or precomputed colors!" - ) - - if ((scales is None or rotations is None) and cov3D_precomp is None) or ( - (scales is not None or rotations is not None) and cov3D_precomp is not None - ): - raise Exception( - "Please provide exactly one of either scale/rotation pair or precomputed 3D covariance!" - ) - - if shs is None: - shs = torch.Tensor([]) - if colors_precomp is None: - colors_precomp = torch.Tensor([]) - - if scales is None: - scales = torch.Tensor([]) - if rotations is None: - rotations = torch.Tensor([]) - if cov3D_precomp is None: - cov3D_precomp = torch.Tensor([]) - - # Invoke C++/CUDA rasterization routine - return rasterize_gaussians( - means3D, - means2D, - shs, - colors_precomp, - opacities, - scales, - rotations, - cov3D_precomp, - raster_settings, - ) - - -if __name__ == "__main__": - import imageio - - device = torch.device("cuda:0") - num_points = 256 - n = 16 - # means3d = torch.randn((num_points, 3), device=device, requires_grad=True) - linspace = torch.linspace(-1, 1, n, device=device) - x, y = torch.meshgrid(linspace, linspace) - z = torch.linspace(0, 1, num_points, device=device) - print(x.shape, y.shape, z.shape) - means3d = torch.stack([x.flatten(), y.flatten(), z], dim=-1) - print(means3d.shape) - means2d = torch.tensor([]) - sh = torch.Tensor([]) - opacities = torch.ones((num_points, 1), device=device) - colors = torch.rand((num_points, 3), device=device) - covs3d = torch.Tensor([]) - # scales = torch.rand((num_points, 3), device=device) - scales = torch.ones((num_points, 3), device=device) - quats = torch.randn((num_points, 4), device=device) - quats /= torch.linalg.norm(quats, dim=-1, keepdim=True) - - # expects view matrix be column major - viewmat = torch.eye(4, device=device) - viewmat[3, 2] = 2 - # projmat = torch.eye(4, device=device) - projmat = viewmat - - fx = 3.0 - fy = 3.0 - img_height = 256 - img_width = 256 - tanfovx = 0.5 * img_width / fx - tanfovy = 0.5 * img_height / fy - bg = torch.ones(3, device=device) - scale_modifier = 1.0 - - settings = GaussianRasterizationSettings( - img_height, - img_width, - tanfovx, - tanfovy, - bg, - scale_modifier, - viewmat, - viewmat, - sh_degree=0, - campos=torch.zeros(3, device=device), - prefiltered=False, - debug=True, - ) - - out_color, radii = rasterize_gaussians( - means3d, means2d, sh, colors, opacities, scales, quats, covs3d, settings - ) - img = (255 * out_color.detach().cpu().permute(1, 2, 0)).byte() - imageio.imwrite("test_reference_forward.png", img) diff --git a/ref_rast/setup.py b/ref_rast/setup.py deleted file mode 100644 index 0269cc2d0..000000000 --- a/ref_rast/setup.py +++ /dev/null @@ -1,27 +0,0 @@ -import os - -from setuptools import setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -CSRC = os.path.abspath(f"{__file__}/../../csrc") - -setup( - name="ref_rast", - packages=["ref_rast"], - description="reference package for differentiable rasterization of gaussians", - keywords="gaussian, splatting, cuda", - ext_modules=[ - CUDAExtension( - name="ref_rast_cuda", - sources=[ - f"{CSRC}/reference/cuda_rasterizer/rasterizer_impl.cu", - f"{CSRC}/reference/cuda_rasterizer/forward.cu", - f"{CSRC}/reference/cuda_rasterizer/backward.cu", - f"{CSRC}/reference/rasterize_points.cu", - f"{CSRC}/reference/ext.cpp", - ], - extra_compile_args={"nvcc": [f"-I {CSRC}/third_party/glm/"]}, - ), - ], - cmdclass={"build_ext": BuildExtension}, -) diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..1d3cf24bb --- /dev/null +++ b/setup.py @@ -0,0 +1,132 @@ +import glob +import os +import os.path as osp +import platform +import sys + +from setuptools import find_packages, setup + +__version__ = None +exec(open("diff_rast/version.py", "r").read()) + +URL = "" # TODO + +BUILD_NO_CUDA = os.getenv("BUILD_NO_CUDA", "0") == "1" +WITH_SYMBOLS = os.getenv("WITH_SYMBOLS", "0") == "1" + + +def get_ext(): + from torch.utils.cpp_extension import BuildExtension + + return BuildExtension.with_options(no_python_abi_suffix=True, use_ninja=False) + + +def get_extensions(): + import torch + from torch.__config__ import parallel_info + from torch.utils.cpp_extension import CUDAExtension + + extensions_dir = osp.join("diff_rast", "cuda", "csrc") + sources = glob.glob(osp.join(extensions_dir, "*.cu")) + glob.glob( + osp.join(extensions_dir, "*.cpp") + ) + # sources = [ + # osp.join(extensions_dir, "ext.cpp"), + # osp.join(extensions_dir, "rasterize.cu"), + # osp.join(extensions_dir, "bindings.cu"), + # osp.join(extensions_dir, "forward.cu"), + # osp.join(extensions_dir, "backward.cu"), + # ] + # remove generated 'hip' files, in case of rebuilds + sources = [path for path in sources if "hip" not in path] + + undef_macros = [] + define_macros = [] + + if sys.platform == "win32": + define_macros += [("diff_rast_EXPORTS", None)] + + extra_compile_args = {"cxx": ["-O3"]} + if not os.name == "nt": # Not on Windows: + extra_compile_args["cxx"] += ["-Wno-sign-compare"] + extra_link_args = [] if WITH_SYMBOLS else ["-s"] + + info = parallel_info() + if ( + "backend: OpenMP" in info + and "OpenMP not found" not in info + and sys.platform != "darwin" + ): + extra_compile_args["cxx"] += ["-DAT_PARALLEL_OPENMP"] + if sys.platform == "win32": + extra_compile_args["cxx"] += ["/openmp"] + else: + extra_compile_args["cxx"] += ["-fopenmp"] + else: + print("Compiling without OpenMP...") + + # Compile for mac arm64 + if sys.platform == "darwin" and platform.machine() == "arm64": + extra_compile_args["cxx"] += ["-arch", "arm64"] + extra_link_args += ["-arch", "arm64"] + + nvcc_flags = os.getenv("NVCC_FLAGS", "") + nvcc_flags = [] if nvcc_flags == "" else nvcc_flags.split(" ") + nvcc_flags += ["-O3"] + if torch.version.hip: + # USE_ROCM was added to later versions of PyTorch. + # Define here to support older PyTorch versions as well: + define_macros += [("USE_ROCM", None)] + undef_macros += ["__HIP_NO_HALF_CONVERSIONS__"] + else: + nvcc_flags += ["--expt-relaxed-constexpr"] + extra_compile_args["nvcc"] = nvcc_flags + + extension = CUDAExtension( + f"diff_rast.csrc", + sources, + include_dirs=[osp.join(extensions_dir, "third_party", "glm")], + define_macros=define_macros, + undef_macros=undef_macros, + extra_compile_args=extra_compile_args, + extra_link_args=extra_link_args, + ) + + return [extension] + + +setup( + name="diff_rast", + version=__version__, + description=" Python package for differentiable rasterization of gaussians", + keywords="gaussian, splatting, cuda", + url=URL, + download_url=f"{URL}/archive/{__version__}.tar.gz", + python_requires=">=3.7", + install_requires=[ + "jaxtyping", + "rich>=12", + "torch", + "typing_extensions; python_version<'3.8'", + ], + extras_require={ + # dev dependencies. Install them by `pip install diff_rast[dev]` + "dev": [ + "black[jupyter]==22.3.0", + "isort==5.10.1", + "pylint==2.13.4", + "pytest==7.1.2", + "pytest-xdist==2.5.0", + "typeguard>=2.13.3", + "pyyaml==6.0", + "build", + "twine", + "ninja", + ], + }, + ext_modules=get_extensions() if not BUILD_NO_CUDA else [], + cmdclass={"build_ext": get_ext()} if not BUILD_NO_CUDA else {}, + packages=find_packages(), + # https://github.com/pypa/setuptools/issues/1461#issuecomment-954725244 + include_package_data=True, +) diff --git a/tests/test_bindings_forward.py b/tests/test_bindings_forward.py deleted file mode 100644 index d3372ab1a..000000000 --- a/tests/test_bindings_forward.py +++ /dev/null @@ -1,109 +0,0 @@ -"""Test against ref bindings - -Make sure you have the ref bindings installed: - - install ref bindings: cd ref_rast && pip install -e . -""" - - -import torch -import imageio - -from diff_rast.rasterize import RasterizeGaussians -from diff_rast.project_gaussians import ProjectGaussians - -from ref_rast import GaussianRasterizationSettings, rasterize_gaussians - -def test_bindings_forward(save_img=False): - means3d, sh, opacities, colors, covs3d, scales, quats, viewmat, projmat = _init_gaussians() - - out_color = _run_ref( - means3d.clone(), - sh.clone(), - opacities.clone(), - colors.clone(), - covs3d.clone(), - scales.clone(), - quats.clone(), - viewmat.clone(), - projmat.clone(), - ) - ref_img = (255 * out_color.detach().cpu()).byte() - imageio.imwrite("test_reference_forward.png", ref_img) - - color = _run_diff_rast(means3d, colors, opacities, scales, quats, viewmat, projmat) - img = (255 * color.detach().cpu()).byte() - imageio.imwrite("test_diff_forward.png", img) - - torch.testing.assert_close(color, out_color) - - -def _run_ref(means3D, sh, opacities, colors, covs3d, scales, quats, viewmat, projmat): - settings = GaussianRasterizationSettings( - img_height, - img_width, - tanfovx, - tanfovy, - bg, - scale_modifier, - viewmat, - projmat, - sh_degree=0, - campos=viewmat[:3, 3], - prefiltered=False, - debug=True, - ) - - out_color, radii = rasterize_gaussians( - means3D.clone(), None, sh, colors.clone(), opacities.clone(), scales.clone(), quats.clone(), covs3d, settings - ) - - return out_color.permute(1, 2, 0) - - -def _run_diff_rast(means, rgbs, opacities, scales, quats, viewmat, projmat): - xys, depths, radii, conics, num_tiles_hit = ProjectGaussians.apply( - means, scales, scale_modifier, quats, viewmat, projmat, fx, fy, img_height, img_width, TILE_BOUNDS - ) - out_img = RasterizeGaussians.apply(xys, depths, radii, conics, num_tiles_hit, rgbs, opacities, img_height, img_width) - - return out_img - - -def _init_gaussians(): - means3d = torch.randn((num_points, 3), device=device) - sh = torch.Tensor([]).to(device) - opacities = torch.ones((num_points, 1), device=device) * 0.9 - colors = torch.rand((num_points, 3), device=device) - covs3d = torch.Tensor([]).to(device) - scales = torch.rand((num_points, 3), device=device) - quats = torch.randn((num_points, 4), device=device) - quats /= torch.linalg.norm(quats, dim=-1, keepdim=True) - - viewmat = torch.eye(4, device=device) - - projmat = viewmat - - means3d.requires_grad = True - scales.requires_grad = True - quats.requires_grad = True - colors.requires_grad = True - opacities.requires_grad = True - - return means3d, sh, opacities, colors, covs3d, scales, quats, viewmat, projmat - - -if __name__ == "__main__": - device = torch.device("cuda:0") - num_points = 100 - - fx = 3.0 - fy = 3.0 - img_height = 256*2 - img_width = 256 - tanfovx = 0.5 * img_width / fx - tanfovy = 0.5 * img_height / fy - bg = torch.ones(3, device=device) - scale_modifier = 1.0 - TILE_BOUNDS = (img_width + 16 - 1) // 16, (img_height + 16 - 1) // 16, 1 - - test_bindings_forward(save_img=True) diff --git a/tests/test_proj.py b/tests/test_proj.py deleted file mode 100644 index 683eab68f..000000000 --- a/tests/test_proj.py +++ /dev/null @@ -1,85 +0,0 @@ -import pytest -import torch - -device = torch.device("cuda:0") - - -@pytest.mark.skipif(not torch.cuda.is_available, reason="No CUDA device") -def test_project_gaussians_forward(): - from diff_rast import _torch_impl - import cuda_lib - - torch.manual_seed(42) - - num_points = 1_000_000 - means3d = torch.randn((num_points, 3), device=device, requires_grad=True) - scales = torch.randn((num_points, 3), device=device) - glob_scale = 0.3 - quats = torch.randn((num_points, 4), device=device) - quats /= torch.linalg.norm(quats, dim=-1, keepdim=True) - viewmat = torch.eye(4, device=device) - projmat = torch.eye(4, device=device) - fx, fy = 3.0, 3.0 - H, W = 512, 512 - - BLOCK_X, BLOCK_Y = 16, 16 - tile_bounds = (W + BLOCK_X - 1) // BLOCK_X, (H + BLOCK_Y - 1) // BLOCK_Y, 1 - - ( - cov3d, - xys, - depths, - radii, - conics, - num_tiles_hit, - ) = cuda_lib.project_gaussians_forward( - num_points, - means3d, - scales, - glob_scale, - quats, - viewmat, - projmat, - fx, - fy, - (H, W), # TODO(ruilong): Is this HW or WH? - tile_bounds, - ) - - ( - _cov3d, - _xys, - _depths, - _radii, - _conics, - _num_tiles_hit, - _masks, - ) = _torch_impl.project_gaussians_forward( - means3d, - scales, - glob_scale, - quats, - viewmat, - projmat, - fx, - fy, - (H, W), - tile_bounds, - ) - - torch.testing.assert_close( - cov3d[_masks], - _cov3d.view(-1, 9)[_masks][:, [0, 1, 2, 4, 5, 8]], - atol=1e-5, - rtol=1e-5, - ) - torch.testing.assert_close(xys[_masks], _xys[_masks]) - torch.testing.assert_close(depths[_masks], _depths[_masks]) - torch.testing.assert_close(radii[_masks], _radii[_masks]) - torch.testing.assert_close(conics[_masks], _conics[_masks]) - torch.testing.assert_close(num_tiles_hit[_masks], _num_tiles_hit[_masks]) - - - -if __name__ == "__main__": - test_project_gaussians_forward()