diff --git a/.github/workflows/core_tests.yml b/.github/workflows/core_tests.yml
new file mode 100644
index 000000000..3e8e62176
--- /dev/null
+++ b/.github/workflows/core_tests.yml
@@ -0,0 +1,26 @@
+name: Core Tests.
+
+on:
+  push:
+    branches: [master]
+  pull_request:
+    branches: [master]
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python 3.8.12
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8.12"
+      - name: Install dependencies
+        run: |
+          pip install isort==5.10.1 black[jupyter]==22.3.0
+      - name: Run Black Format Check
+        run: black . diff_rast/ tests/ examples/ --check
diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml
new file mode 100644
index 000000000..af384ce8f
--- /dev/null
+++ b/.github/workflows/doc.yml
@@ -0,0 +1,35 @@
+name: Docs
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+  workflow_dispatch:
+
+permissions:
+  contents: write
+jobs:
+  docs:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v3
+        with:
+          python-version: '3.9'
+      - name: Install dependencies
+        run: |
+          pip install -r docs/requirements.txt
+          pip install torch==2.0.0 --index-url https://download.pytorch.org/whl/cpu
+          BUILD_NO_CUDA=1 pip install .
+      - name: Sphinx build
+        # fail on warnings: "-W --keep-going"
+        run: |
+          sphinx-build docs/source _build
+      - name: Deploy
+        uses: peaceiris/actions-gh-pages@v3
+        with:
+          publish_branch: gh-pages
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          publish_dir: _build/
+          force_orphan: true
+        #   cname: ...
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index ddc3d2fde..548d2ddf4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -117,12 +117,6 @@ venv.bak/
 # line_profiler
 *.lprof
 
-# vscode
-.vsocde
-
 *build
 compile_commands.json
-
-*.png
-*.txt
-*.dump
+*.dump
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
index d3f25e6ad..c62722305 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
-[submodule "csrc/third_party/glm"]
-	path = csrc/third_party/glm
-	url = https://github.com/g-truc/glm.git
+[submodule "diff_rast/cuda/csrc/third_party/glm"]
+	path = diff_rast/cuda/csrc/third_party/glm
+	url = https://github.com/g-truc/glm.git
\ No newline at end of file
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 000000000..97db421e3
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+recursive-include dir-pattern diff_rast/cuda/csrc
\ No newline at end of file
diff --git a/README.md b/README.md
index 459e27548..b9f9efcd2 100644
--- a/README.md
+++ b/README.md
@@ -2,18 +2,25 @@
 
 Our version of differentiable gaussian rasterizer
 
-# ref_rast
+# Installation
 
-Copied official version of differentiable gaussian rasterizer
+For CUDA development, it is recommend to install with `BUILD_NO_CUDA=1`, which
+will disable compiling during pip install, and instead use JIT compiling on your
+first run. The benefit of JIT compiling is that it does incremental compiling as
+you modify your cuda code so it is much faster than re-compile through pip. Note
+the JIT compiled library can be found under `~/.cache/torch_extensions/py*-cu*/`.
 
-# Installation
+```
+BUILD_NO_CUDA=1 pip install -e .[dev]
+```
+
+If you won't touch the underlying CUDA code, you can just install with compiling:
 
 ```
-python3 -m pip install --upgrade pip
-cd diff_rast; pip install -e .
-cd ../ref_rast; pip install -e .
+pip install -e .[dev]
 ```
 
+
 # Brief walkthrough
 
 The main python bindings for rasterization are found by importing diff_rast
@@ -23,13 +30,6 @@ import diff_rast
 help(diff_rast)
 ```
 
-Additional supported cuda functions are found by importing cuda_lib
-
-```
-import cuda_lib
-help(cuda_lib)
-```
-
 # clangd setup (for Neovim)
 
 [clangd](https://clangd.llvm.org/) is a nice tool for providing completions,
diff --git a/csrc/check_serial_backward.cu b/csrc/check_serial_backward.cu
deleted file mode 100644
index e391ccf25..000000000
--- a/csrc/check_serial_backward.cu
+++ /dev/null
@@ -1,285 +0,0 @@
-#include <cstdlib>
-#include <cstring>
-#include <ctime>
-#include <iomanip>
-#include <iostream>
-#include <math.h>
-#include <vector>
-
-#include "backward.cuh"
-#include "forward.cuh"
-#include "helpers.cuh"
-#include "serial_backward.cuh"
-
-#include "debug_utils.h"
-
-float random_float() { return (float)std::rand() / RAND_MAX; }
-
-float4 random_quat() {
-    float u = random_float();
-    float v = random_float();
-    float w = random_float();
-    return {
-        sqrt(1.f - u) * sin(2.f * (float)M_PI * v),
-        sqrt(1.f - u) * cos(2.f * (float)M_PI * v),
-        sqrt(u) * sin(2.f * (float)M_PI * w),
-        sqrt(u) * cos(2.f * (float)M_PI * w)};
-}
-
-float3 compute_conic(const float3 cov2d) {
-    float det = cov2d.x * cov2d.z - cov2d.y * cov2d.y;
-    if (det == 0.f)
-        return {0.f, 0.f, 0.f};
-    float inv_det = 1.f / det;
-    float3 conic;
-    conic.x = cov2d.z * inv_det;
-    conic.y = -cov2d.y * inv_det;
-    conic.z = cov2d.x * inv_det;
-    return conic;
-}
-
-void compare_project2d_mean_backward(int check_index) {
-    float3 mean = {random_float(), random_float(), random_float()};
-    // clang-format off
-    float proj[] = {
-        1.f, 0.f, 0.f, 0.f,
-        0.f, 1.f, 0.f, 0.f,
-        0.f, 0.f, 1.f, 0.f,
-        0.f, 0.f, 0.f, 1.f
-    };
-    // clang-format on
-    float2 dL_dmean2d = {random_float(), random_float()};
-
-    float3 dL_dmean = project_pix_vjp(proj, mean, (dim3){2, 2, 1}, dL_dmean2d);
-    float3 dL_dmean_ref = projectMean2DBackward(mean, proj, dL_dmean2d);
-
-    // comparison
-    const std::vector<std::pair<float, float>> dmean_data{
-        {dL_dmean.x, dL_dmean_ref.x},
-        {dL_dmean.y, dL_dmean_ref.y},
-        {dL_dmean.z, dL_dmean_ref.z},
-    };
-    print_errors(dmean_data, "dmean (project2d)", check_index);
-}
-
-void compare_conic_backward(int check_index) {
-    float3 cov2d = {random_float(), random_float(), random_float()};
-    float3 conic = compute_conic(cov2d);
-    float3 dL_dconic = {random_float(), random_float(), random_float()};
-    float3 dL_dcov2d;
-    cov2d_to_conic_vjp(conic, dL_dconic, dL_dcov2d);
-    float3 dL_dcov2d_ref = computeConicBackward(cov2d, dL_dconic);
-
-    // comparison
-    const std::vector<std::pair<float, float>> dcov2d_data{
-        {dL_dcov2d.x, dL_dcov2d_ref.x},
-        {dL_dcov2d.y, dL_dcov2d_ref.y},
-        {dL_dcov2d.z, dL_dcov2d_ref.z},
-    };
-    print_errors(dcov2d_data, "dcov2d (conic)", check_index);
-}
-
-void compare_cov3d_backward(int check_index) {
-    float3 scale = {random_float(), random_float(), random_float()};
-    float4 quat = random_quat();
-    float4 quat_ref = quat;
-    // float4 quat_ref = {quat.w, quat.x, quat.y, quat.z};
-    float dL_dcov3d[] = {
-        random_float(),
-        random_float(),
-        random_float(),
-        random_float(),
-        random_float(),
-        random_float()};
-    float3 dL_ds = {0.f, 0.f, 0.f};
-    float4 dL_dq = {0.f, 0.f, 0.f, 0.f};
-    scale_rot_to_cov3d_vjp(scale, 1.f, quat, dL_dcov3d, dL_ds, dL_dq);
-
-    float3 dL_ds_ref = {0.f, 0.f, 0.f};
-    float4 dL_dq_ref = {0.f, 0.f, 0.f, 0.f};
-    computeCov3DBackward(scale, 1.f, quat_ref, dL_dcov3d, dL_ds_ref, dL_dq_ref);
-
-    // comparison
-    const std::vector<std::pair<float, float>> ds_data{
-        {dL_ds.x, dL_ds_ref.x},
-        {dL_ds.y, dL_ds_ref.y},
-        {dL_ds.z, dL_ds_ref.z},
-    };
-    print_errors(ds_data, "ds (cov3d)", check_index);
-
-    const std::vector<std::pair<float, float>> dquat_data{
-        {dL_dq.x, dL_dq_ref.x},
-        {dL_dq.y, dL_dq_ref.y},
-        {dL_dq.z, dL_dq_ref.z},
-        {dL_dq.w, dL_dq_ref.w},
-    };
-    print_errors(dquat_data, "dquat (cov3d)", check_index);
-}
-
-void compare_cov2d_ewa_backward(int check_index) {
-    float3 mean = {random_float(), random_float(), random_float()};
-    float3 scale = {random_float(), random_float(), random_float()};
-    float4 quat = random_quat();
-    float cov3d[] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
-    scale_rot_to_cov3d(scale, 1.f, quat, cov3d);
-    float3 dL_dcov2d = {random_float(), random_float(), random_float()};
-    float3 dL_dmean = {0.f, 0.f, 0.f};
-    float3 dL_dmean_ref = {0.f, 0.f, 0.f};
-    float dL_dcov[] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
-    float dL_dcov_ref[] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
-
-    // functions expect different view matrix convention
-    // clang-format off
-    float viewmat[] = {
-        1.f, 0.f, 0.f, 0.f,
-        0.f, 1.f, 0.f, 0.f,
-        0.f, 0.f, 1.f, 10.f,
-        0.f, 0.f, 0.f, 1.f
-    };
-    float viewmat_ref[] = {
-        1.f, 0.f, 0.f, 0.f,
-        0.f, 1.f, 0.f, 0.f,
-        0.f, 0.f, 1.f, 0.f,
-        0.f, 0.f, 10.f, 1.f
-    };
-    // clang-format on
-    computeCov2DBackward(
-        mean,
-        cov3d,
-        viewmat_ref,
-        1.f,
-        1.f,
-        1.f,
-        1.f,
-        dL_dcov2d,
-        dL_dmean_ref,
-        dL_dcov_ref
-    );
-    project_cov3d_ewa_vjp(
-        mean, cov3d, viewmat, 1.f, 1.f, dL_dcov2d, dL_dmean, dL_dcov
-    );
-
-    // comparison
-    const std::vector<std::pair<float, float>> dmean_data{
-        {dL_dmean.x, dL_dmean_ref.x},
-        {dL_dmean.x, dL_dmean_ref.x},
-        {dL_dmean.x, dL_dmean_ref.x},
-    };
-    print_errors(dmean_data, "dmean (cov2d_ewa)", check_index);
-
-    std::vector<std::pair<float, float>> dcov_data;
-    for (int i = 0; i < 6; ++i)
-        dcov_data.push_back({dL_dcov[i], dL_dcov_ref[i]});
-
-    print_errors(dcov_data, "dcov (cov2d_ewa)", check_index);
-}
-
-void compare_rasterize_backward(const int N, int check_index) {
-    float2 p = {0.f, 0.f};
-    float T_final = 5e-3;
-    constexpr int C = 3;
-    float dL_dout[C];
-    for (int i = 0; i < C; ++i) {
-        dL_dout[i] = random_float();
-    }
-
-    float2 xys[N];
-    float3 conics[N];
-    float opacities[N];
-    float4 conics_o[N];
-    float rgbs[C * N];
-    for (int i = 0; i < N; ++i) {
-        float v = (float)i - (float)N * 0.5f;
-        xys[i] = {0.1f * v, 0.1f * v};
-        conics[i] = {10.f, 0.f, 10.f};
-        opacities[i] = 0.5f;
-        conics_o[i] = {conics[i].x, conics[i].y, conics[i].z, opacities[i]};
-        for (int c = 0; c < C; ++c) {
-            rgbs[C * i + c] = random_float();
-        }
-    }
-    float dL_drgb[C * N] = {0.f};
-    float dL_drgb_ref[C * N] = {0.f};
-    float dL_do[N] = {0.f};
-    float dL_do_ref[N] = {0.f};
-    float2 dL_dm[N] = {0.f};
-    float2 dL_dm_ref[N] = {0.f};
-    float3 dL_dc[N] = {0.f};
-    float3 dL_dc_ref[N] = {0.f};
-
-    rasterize_vjp(
-        N,
-        p,
-        C,
-        xys,
-        conics,
-        opacities,
-        rgbs,
-        T_final,
-        dL_dout,
-        dL_drgb,
-        dL_do,
-        dL_dm,
-        dL_dc
-    );
-    rasterizeBackward(
-        N,
-        2,
-        2,
-        p,
-        xys,
-        conics_o,
-        rgbs,
-        T_final,
-        dL_dout,
-        dL_drgb_ref,
-        dL_do_ref,
-        dL_dm_ref,
-        dL_dc_ref
-    );
-
-    // comparison
-    std::vector<std::pair<float, float>> drgb_data;
-    for (int i = 0; i < C * N; ++i)
-        drgb_data.push_back({dL_drgb[i], dL_drgb_ref[i]});
-    print_errors(drgb_data, "drgb (rasterize)", check_index);
-
-    std::vector<std::pair<float, float>> do_data;
-    for (int i = 0; i < N; ++i)
-        do_data.push_back({dL_do[i], dL_do_ref[i]});
-    print_errors(do_data, "do (rasterize)", check_index);
-
-    std::vector<std::pair<float, float>> dm_data;
-    for (int i = 0; i < N; ++i) {
-        dm_data.push_back({dL_dm[i].x, dL_dm_ref[i].x});
-        dm_data.push_back({dL_dm[i].y, dL_dm_ref[i].y});
-    }
-    print_errors(dm_data, "dm (rasterize)", check_index);
-
-    std::vector<std::pair<float, float>> dc_data;
-    for (int i = 0; i < N; ++i) {
-        dc_data.push_back({dL_dc[i].x, dL_dc_ref[i].x});
-        dc_data.push_back({dL_dc[i].y, dL_dc_ref[i].y});
-        dc_data.push_back({dL_dc[i].z, dL_dc_ref[i].z});
-    }
-    print_errors(dc_data, "dc (rasterize)", check_index);
-}
-
-int main(int argc, char *argv[]) {
-    int num_iters = 1000;
-    if (argc > 1) {
-        num_iters = std::stoi(argv[1]);
-    }
-    int num_points = 8;
-    if (argc > 2) {
-        num_points = std::stoi(argv[2]);
-    }
-    for (int x = 0; x < num_iters; x++) {
-        compare_project2d_mean_backward(x);
-        compare_conic_backward(x);
-        compare_cov3d_backward(x);
-        compare_cov2d_ewa_backward(x);
-        compare_rasterize_backward(num_points, x);
-    }
-    return 0;
-}
diff --git a/csrc/check_serial_forward.cu b/csrc/check_serial_forward.cu
deleted file mode 100644
index 54aa8d651..000000000
--- a/csrc/check_serial_forward.cu
+++ /dev/null
@@ -1,482 +0,0 @@
-#include <cstdlib>
-#include <cstring>
-#include <ctime>
-#include <iomanip>
-#include <iostream>
-#include <math.h>
-#include <vector>
-
-#include "forward.cuh"
-#include "helpers.cuh"
-
-#include "reference/cuda_rasterizer/forward.cu"
-
-float random_float() { return (float)std::rand() / RAND_MAX; }
-
-float4 random_quat() {
-    float u = random_float();
-    float v = random_float();
-    float w = random_float();
-    return {
-        sqrt(1.f - u) * sin(2.f * (float)M_PI * v),
-        sqrt(1.f - u) * cos(2.f * (float)M_PI * v),
-        sqrt(u) * sin(2.f * (float)M_PI * w),
-        sqrt(u) * cos(2.f * (float)M_PI * w)};
-}
-
-// Host version of ref preprocessCUDA
-__host__ void host_preprocessCUDA(
-    int P,
-    const float *orig_points,
-    const glm::vec3 *scales,
-    const float scale_modifier,
-    const glm::vec4 *rotations,
-    const float *opacities,
-    const float *shs,
-    bool *clamped,
-    const float *cov3D_precomp,
-    const float *colors_precomp,
-    const float *viewmatrix,
-    const float *projmatrix,
-    const glm::vec3 *cam_pos,
-    const int W,
-    int H,
-    const float tan_fovx,
-    float tan_fovy,
-    const float focal_x,
-    float focal_y,
-    int *radii,
-    float2 *points_xy_image,
-    float *depths,
-    float *cov3Ds,
-    float *rgb,
-    float4 *conic_opacity,
-    const dim3 grid, // tile_bounds
-    uint32_t *tiles_touched
-) {
-    auto idx = 0;
-    if (idx >= P)
-        return;
-    int C = 3;
-    // Initialize radius and touched tiles to 0. If this isn't changed,
-    // this Gaussian will not be processed further.
-    radii[idx] = 0;
-    tiles_touched[idx] = 0;
-
-    // Perform near culling, quit if outside.
-    float3 p_view;
-    if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, false, p_view))
-        return;
-
-    // printf("ref: p_view %d %.2f %.2f %.2f\n", idx, p_view.x, p_view.y,
-    // p_view.z);
-
-    // Transform point by projecting
-    float3 p_orig = {
-        orig_points[3 * idx],
-        orig_points[3 * idx + 1],
-        orig_points[3 * idx + 2]};
-    float4 p_hom = transformPoint4x4(p_orig, projmatrix);
-    float p_w = 1.0f / (p_hom.w + 0.0000001f);
-    float3 p_proj = {p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w};
-    // printf("reff rast: p_proj %.2f %.2f \n", p_proj.x, p_proj.y);
-
-    // If 3D covariance matrix is precomputed, use it, otherwise compute
-    // from scaling and rotation parameters.
-    const float *cov3D;
-    if (cov3D_precomp != nullptr) {
-        cov3D = cov3D_precomp + idx * 6;
-    } else {
-        computeCov3D(
-            scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6
-        );
-        cov3D = cov3Ds + idx * 6;
-    }
-    // Compute 2D screen-space covariance matrix
-    float3 cov = computeCov2D(
-        p_orig, focal_x, focal_y, tan_fovx, tan_fovy, cov3D, viewmatrix
-    );
-    // printf("ref: cov2d %d, %.2f %.2f %.2f\n", idx, cov.x, cov.y, cov.z);
-
-    // Invert covariance (EWA algorithm)
-    float det = (cov.x * cov.z - cov.y * cov.y);
-    if (det == 0.0f)
-        return;
-    float det_inv = 1.f / det;
-    float3 conic = {cov.z * det_inv, -cov.y * det_inv, cov.x * det_inv};
-    // printf("ref: conic %d %.2f %.2f %.2f\n", idx, conic.x, conic.y, conic.z);
-
-    // Compute extent in screen space (by finding eigenvalues of
-    // 2D covariance matrix). Use extent to compute a bounding rectangle
-    // of screen-space tiles that this Gaussian overlaps with. Quit if
-    // rectangle covers 0 tiles.
-    float mid = 0.5f * (cov.x + cov.z);
-    float lambda1 = mid + sqrt(max(0.1f, mid * mid - det));
-    float lambda2 = mid - sqrt(max(0.1f, mid * mid - det));
-    float my_radius = ceil(3.f * sqrt(max(lambda1, lambda2)));
-    float2 point_image = {ndc2Pix(p_proj.x, W), ndc2Pix(p_proj.y, H)};
-    // printf("ref: point_image %d %.2f %.2f \n", idx, point_image.x,
-    // point_image.y);
-    uint2 rect_min, rect_max;
-    // printf("grid %d %d \n", grid.x, grid.y);
-    getRect(point_image, my_radius, rect_min, rect_max, grid);
-    if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0)
-        // return;
-
-        // If colors have been precomputed, use them, otherwise convert
-        // spherical harmonics coefficients to RGB color.
-        if (colors_precomp == nullptr) {
-            // glm::vec3 result = computeColorFromSH(idx, D, M,
-            // (glm::vec3*)orig_points, *cam_pos, shs, clamped); rgb[idx * C +
-            // 0] = result.x; rgb[idx * C + 1] = result.y; rgb[idx * C + 2] =
-            // result.z;
-        }
-
-    // Store some useful helper data for the next steps.
-    depths[idx] = p_view.z;
-    radii[idx] = my_radius;
-    points_xy_image[idx] = point_image;
-    // Inverse 2D covariance and opacity neatly pack into one float4
-    // conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[idx] };
-    tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x);
-    int32_t tile_area = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x);
-    printf(
-        "ref rast: point %d x %.2f y %.2f z %.2f, radius %d, # tiles %d, "
-        "tile_min %d %d, tile_max %d %d \n",
-        idx,
-        point_image.x,
-        point_image.y,
-        depths[idx],
-        radii[idx],
-        tile_area,
-        rect_min.x,
-        rect_min.y,
-        rect_max.x,
-        rect_max.y
-    );
-}
-
-// Host version of diff_rast projection kernel
-__host__ __device__ void host_project_gaussians_forward_kernel(
-    const int num_points,
-    const float3 *means3d,
-    const float3 *scales,
-    const float glob_scale,
-    const float4 *quats,
-    const float *viewmat,
-    const float *projmat,
-    const float fx,
-    const float fy,
-    const dim3 img_size,
-    const dim3 tile_bounds,
-    float *covs3d,
-    float2 *xys,
-    float *depths,
-    int *radii,
-    float3 *conics,
-    int32_t *num_tiles_hit
-) {
-    // unsigned idx = cg::this_grid().thread_rank(); // idx of thread within
-    // grid
-    int idx = 0;
-    radii[idx] = 0;
-    num_tiles_hit[idx] = 0;
-    if (idx >= num_points) {
-        return;
-    }
-
-    float3 p_world = means3d[idx];
-    // printf("diff rast: p_world %d %.2f %.2f %.2f\n", idx, p_world.x,
-    // p_world.y, p_world.z);
-    float3 p_view;
-    if (clip_near_plane(p_world, viewmat, p_view)) {
-        // printf("%d is out of frustum z %.2f, returning\n", idx, p_view.z);
-        return;
-    }
-    // printf("diff rast: p_view %d %.2f %.2f %.2f\n", idx, p_view.x, p_view.y,
-    // p_view.z);
-
-    // compute the projected covariance
-    float3 scale = scales[idx];
-    float4 quat = quats[idx];
-    // printf("%d scale %.2f %.2f %.2f\n", idx, scale.x, scale.y, scale.z);
-    // printf("%d quat %.2f %.2f %.2f %.2f\n", idx, quat.w, quat.x, quat.y,
-    // quat.z);
-    float *cur_cov3d = &(covs3d[6 * idx]);
-    scale_rot_to_cov3d(scale, glob_scale, quat, cur_cov3d);
-
-    // project to 2d with ewa approximation
-    float tan_fovx = 0.5 * img_size.x / fx;
-    float tan_fovy = 0.5 * img_size.y / fy;
-    float3 cov2d = project_cov3d_ewa(p_world, cur_cov3d, viewmat, fx, fy, tan_fovx, tan_fovy);
-    //printf("diff rast: cov2d %d, %.2f %.2f %.2f\n", idx, cov2d.x, cov2d.y, cov2d.z);
-
-    float3 conic;
-    float radius;
-    bool ok = compute_cov2d_bounds(cov2d, conic, radius);
-    if (!ok)
-        return; // zero determinant
-    // printf("diff rast: conic %d %.2f %.2f %.2f\n", idx, conic.x, conic.y,
-    // conic.z);
-    conics[idx] = conic;
-
-    // compute the projected mean
-    float2 center = project_pix(projmat, p_world, img_size);
-    // printf("diff rast: point_image %d %.2f %.2f \n", idx, center.x,
-    // center.y);
-    uint2 tile_min, tile_max;
-    get_tile_bbox(center, radius, tile_bounds, tile_min, tile_max);
-    // printf("diff rast: tile_min %d %d \n", tile_min.x, tile_min.y);
-    int32_t tile_area = (tile_max.x - tile_min.x) * (tile_max.y - tile_min.y);
-
-    num_tiles_hit[idx] = tile_area;
-    depths[idx] = p_view.z;
-    radii[idx] = (int)radius;
-    xys[idx] = center;
-    printf(
-        "diff rast: point %d x %.2f y %.2f z %.2f, radius %d, # tiles %d, "
-        "tile_min %d %d, tile_max %d %d \n",
-        idx,
-        center.x,
-        center.y,
-        depths[idx],
-        radii[idx],
-        tile_area,
-        tile_min.x,
-        tile_min.y,
-        tile_max.x,
-        tile_max.y
-    );
-}
-
-void compare_cov2d_forward() {
-    // TODO: test with more than one point and varying cov3d/viewmat
-    int num_points = 1;
-    float fx = 1;
-    float fy = 1;
-    const int W = 256;
-    const int H = 256 * 2;
-    float tan_fovx = 0.5 * W / fx;
-    float tan_fovy = 0.5 * H / fy;
-    float viewmat[] = {
-        1.f,
-        0.f,
-        0.f,
-        0.f,
-        0.f,
-        1.f,
-        0.f,
-        0.f,
-        0.f,
-        0.f,
-        1.f,
-        8.f,
-        0.f,
-        0.f,
-        0.f,
-        1.f};
-    float viewmatColumnMajor[] = {
-        1.0f,
-        0.0f,
-        0.0f,
-        0.0f,
-        0.0f,
-        1.0f,
-        0.0f,
-        0.0f,
-        0.0f,
-        0.0f,
-        1.0f,
-        0.0f,
-        0.0f,
-        0.0f,
-        8.0f,
-        1.0f};
-    float cov3d[]{1.f, 0.f, 0.f, 1.f, 0.f, 1.f};
-    float3 mean = {1.f, 1.f, 1.f};
-
-    // diff rast
-    float3 diff_rast_cov2d = project_cov3d_ewa(mean,cov3d, viewmat, fx, fy, tan_fovx, tan_fovy);
-    std::cout<<"diff rast cov2d: "<<diff_rast_cov2d.x<<" "<<diff_rast_cov2d.y<<" "<<diff_rast_cov2d.z<<std::endl; 
-    // ref rast
-    float3 res = computeCov2D(
-        mean, fx, fy, tan_fovx, tan_fovy, cov3d, viewmatColumnMajor
-    );
-    std::cout << "ref rast cov2d:  " << res.x << " " << res.y << " " << res.z
-              << std::endl;
-}
-
-void compare_scale_rot_to_cov3d() {
-    // TODO: make it work with more than one point
-    int num_points = 1;
-    float3 scale = {random_float(), random_float(), random_float()};
-    float glob_scale = random_float();
-    float4 quat = random_quat();
-    float *cov3d = new float[6 * num_points];
-
-    // diff rast
-    scale_rot_to_cov3d(scale, glob_scale, quat, cov3d);
-    std::cout << "diff rast cov3d: " << cov3d[0] << " " << cov3d[1] << " "
-              << cov3d[2] << std::endl;
-
-    // ref rast
-    float *ref_cov3d = new float[6 * num_points];
-    computeCov3D(
-        glm::vec3({scale.x, scale.y, scale.z}),
-        glob_scale,
-        glm::vec4({quat.x, quat.y, quat.z, quat.w}),
-        ref_cov3d
-    );
-    std::cout << "ref rast cov3d: " << ref_cov3d[0] << " " << ref_cov3d[1]
-              << " " << ref_cov3d[2] << std::endl;
-}
-
-void compare_project_preprocess() {
-    // setup
-    const int num_points = 1;
-    const float3 mean = {5.f, 1.f, 1.f};
-    float3 *means3d = new float3[num_points];
-    means3d[0] = mean;
-    const float3 scale = {random_float(), random_float(), random_float()};
-    const float3 *scales = &scale;
-    const float glob_scale = 1;
-    float4 temp_quat = random_quat();
-    float4 *quats = &temp_quat;
-    const float viewmat[] = {
-        1.f,
-        0.f,
-        0.f,
-        0.f,
-        0.f,
-        1.f,
-        0.f,
-        0.f,
-        0.f,
-        0.f,
-        1.f,
-        8.f,
-        0.f,
-        0.f,
-        0.f,
-        1.f};
-
-    const float *viewmats = viewmat;
-    const float *projmats = viewmat;
-
-    float *covs3d = new float[num_points * 6];
-    float2 *xys = new float2[num_points * 2];
-    float *depths = new float[num_points];
-    int *radii = new int[num_points];
-    float3 *conics = new float3[num_points];
-    int32_t *num_tiles_hit = new int32_t[num_points];
-
-    const float fov_x = M_PI / 2.f;
-    const int W = 512;
-    const int H = 512 * 2;
-    const float focal = 0.5 * (float)W / tan(0.5 * fov_x);
-    float tan_fovx = 0.5 * W / focal;
-    float tan_fovy = 0.5 * H / focal;
-    const dim3 tile_bounds = {
-        (W + BLOCK_X - 1) / BLOCK_X, (H + BLOCK_Y - 1) / BLOCK_Y, 1};
-    const dim3 img_size = {W, H, 1};
-    const dim3 block = {BLOCK_X, BLOCK_Y, 1};
-
-    // diff rast
-    host_project_gaussians_forward_kernel(
-        num_points,
-        means3d,
-        scales,
-        glob_scale,
-        quats,
-        viewmats,
-        projmats,
-        focal,
-        focal,
-        img_size,
-        tile_bounds,
-        covs3d,
-        xys,
-        depths,
-        radii,
-        conics,
-        num_tiles_hit
-    );
-
-    float *covs3d_ref = new float[num_points * 6];
-    float2 *xys_ref = new float2[num_points * 2];
-    float *depths_ref = new float[num_points];
-    int *radii_ref = new int[num_points];
-    float3 *conics_ref = new float3[num_points];
-    uint32_t *num_tiles_hit_ref = new uint32_t[num_points];
-
-    glm::vec3 scales_glm = glm::vec3({scale.x, scale.y, scale.z});
-    glm::vec3 *scales_ref = &scales_glm;
-    glm::vec4 quat_glm =
-        glm::vec4({temp_quat.x, temp_quat.y, temp_quat.z, temp_quat.w});
-    glm::vec4 *quat_ref = &quat_glm;
-    glm::vec3 *cam_pos = nullptr;
-
-    bool clamped_val = false;
-    bool *clamped = &clamped_val;
-    float *colors_precomp = nullptr;
-    float *rgb = nullptr;
-    float4 *conic_opacity = nullptr;
-    float *cov3D_precomp = nullptr;
-    float *opacities = new float[num_points];
-    opacities[0] = 0.9f;
-    float *shs = nullptr;
-    float viewmatColumnMajor[] = {
-        1.0f,
-        0.0f,
-        0.0f,
-        0.0f,
-        0.0f,
-        1.0f,
-        0.0f,
-        0.0f,
-        0.0f,
-        0.0f,
-        1.0f,
-        0.0f,
-        0.0f,
-        0.0f,
-        8.0f,
-        1.0f};
-    const float *viewmats_ref = viewmatColumnMajor;
-
-    // ref rast
-    host_preprocessCUDA(
-        num_points,
-        (float *)means3d,
-        scales_ref,
-        glob_scale,
-        quat_ref,
-        opacities,
-        shs,
-        clamped,
-        cov3D_precomp,
-        colors_precomp,
-        viewmats_ref,
-        viewmats_ref,
-        cam_pos,
-        W,
-        H,
-        tan_fovx,
-        tan_fovy,
-        focal,
-        focal,
-        radii_ref,
-        xys_ref,
-        depths_ref,
-        covs3d_ref,
-        rgb,
-        conic_opacity,
-        tile_bounds, // this is called "grid" in ref
-        num_tiles_hit_ref
-    );
-}
-int main() {
-    // compare_cov2d_forward();
-    // compare_scale_rot_to_cov3d();
-    compare_project_preprocess();
-}
diff --git a/csrc/debug_utils.cpp b/csrc/debug_utils.cpp
deleted file mode 100644
index c148d0a0a..000000000
--- a/csrc/debug_utils.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-#include "debug_utils.h"
-
-#include <iostream>
-#include <iomanip>
-
-float percent_error(float exp_val, float true_val) {
-    return (abs(true_val - exp_val) / true_val ) * 100.f;
-}
-
-std::vector<float> percent_errors(
-    const std::vector<std::pair<float,float>>& values
-) {
-    std::vector<float> p_errors;
-
-    for (int i = 0; i < values.size(); ++i) {
-        const float exp_val  = values[i].first;
-        const float true_val = values[i].second;
-        const float p_error{ percent_error(exp_val, true_val) };
-        p_errors.push_back(p_error);
-    }
-
-    return p_errors;
-}
-
-void print_errors(
-    const std::vector<std::pair<float, float>> &values,
-    const std::string &name,
-    int check_index,
-    float error_threshold // by default percent error (0.01%)
-) {
-    const std::vector<float> p_errors{percent_errors(values)};
-
-    // only print if input is unexpected
-    bool data_within_error{true};
-    for (float p : p_errors)
-        if (p > error_threshold)
-            data_within_error = false;
-
-    if (data_within_error)
-        return;
-
-    // format cout for how we want to display data
-    std::cout << std::setprecision(2);
-
-    std::cout << "Error on check: " << check_index << '\n'
-              << name << ":\n"
-              << "     ours:     refs:\n";
-
-    for (int i = 0; i < values.size(); ++i) {
-        const float d1 = values[i].first;
-        const float d2 = values[i].second;
-        std::cout << '[' << i << "]: " << std::scientific 
-                  << std::setw(10) << d1 << ", " 
-                  << std::setw(10) << d2
-                  << "\t(percent error=" << std::fixed << p_errors[i] << ")\n";
-    }
-    std::cout << '\n';
-}
\ No newline at end of file
diff --git a/csrc/debug_utils.h b/csrc/debug_utils.h
deleted file mode 100644
index e6409cca6..000000000
--- a/csrc/debug_utils.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#pragma once
-
-#include <string>
-#include <utility>
-#include <vector>
-
-float percent_error(float exp_val, float true_val);
-
-// takes a vector of float pairs, where first value is experimental, 
-// and second is true value
-std::vector<float> percent_errors(
-    const std::vector<std::pair<float,float>>& values
-);
-
-// takes a vector of float pairs, where pair's first value is 
-// "experimental value", and second is "true value". If any pair's percent exceeds 
-// error threshold, the entire vector will be displayed in a nice lil format
-void print_errors(
-    const std::vector<std::pair<float, float>> &values,
-    const std::string &name,
-    int check_index,
-    float error_threshold = 0.01f // percent error (0.01%)
-);
\ No newline at end of file
diff --git a/csrc/main.cpp b/csrc/main.cpp
deleted file mode 100644
index 83da83f72..000000000
--- a/csrc/main.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-#include "tgaimage.h"
-int main() {
-    TGAColor red;
-    red[2] = 255;
-    red[3] = 255;
-    TGAImage image(100, 100, TGAImage::RGB);
-    image.set(52, 41, red);
-    image.flip_vertically(
-    ); // i want to have the origin at the left bottom corner of the image
-    image.write_tga_file("output.tga");
-    return 0;
-}
diff --git a/csrc/reference/cuda_rasterizer/auxiliary.h b/csrc/reference/cuda_rasterizer/auxiliary.h
deleted file mode 100644
index 84d8391cd..000000000
--- a/csrc/reference/cuda_rasterizer/auxiliary.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Copyright (C) 2023, Inria
- * GRAPHDECO research group, https://team.inria.fr/graphdeco
- * All rights reserved.
- *
- * This software is free for non-commercial, research and evaluation use 
- * under the terms of the LICENSE.md file.
- *
- * For inquiries contact  george.drettakis@inria.fr
- */
-
-#ifndef CUDA_RASTERIZER_AUXILIARY_H_INCLUDED
-#define CUDA_RASTERIZER_AUXILIARY_H_INCLUDED
-
-#include "config.h"
-#include "stdio.h"
-
-#define BLOCK_SIZE (BLOCK_X * BLOCK_Y)
-#define NUM_WARPS (BLOCK_SIZE/32)
-
-// Spherical harmonics coefficients
-__device__ const float SH_C0 = 0.28209479177387814f;
-__device__ const float SH_C1 = 0.4886025119029199f;
-__device__ const float SH_C2[] = {
-	1.0925484305920792f,
-	-1.0925484305920792f,
-	0.31539156525252005f,
-	-1.0925484305920792f,
-	0.5462742152960396f
-};
-__device__ const float SH_C3[] = {
-	-0.5900435899266435f,
-	2.890611442640554f,
-	-0.4570457994644658f,
-	0.3731763325901154f,
-	-0.4570457994644658f,
-	1.445305721320277f,
-	-0.5900435899266435f
-};
-
-__forceinline__ __host__ __device__ float ndc2Pix(float v, int S)
-{
-	return ((v + 1.0) * S - 1.0) * 0.5;
-}
-
-__forceinline__ __host__ __device__ void getRect(const float2 p, int max_radius, uint2& rect_min, uint2& rect_max, dim3 grid)
-{
-	rect_min = {
-		min(grid.x, max((int)0, (int)((p.x - max_radius) / BLOCK_X))),
-		min(grid.y, max((int)0, (int)((p.y - max_radius) / BLOCK_Y)))
-	};
-	rect_max = {
-		min(grid.x, max((int)0, (int)((p.x + max_radius + BLOCK_X - 1) / BLOCK_X))),
-		min(grid.y, max((int)0, (int)((p.y + max_radius + BLOCK_Y - 1) / BLOCK_Y)))
-	};
-}
-
-__forceinline__ __host__ __device__ float3 transformPoint4x3(const float3& p, const float* matrix)
-{
-	float3 transformed = {
-		matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12],
-		matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13],
-		matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14],
-	};
-	return transformed;
-}
-
-__forceinline__ __host__ __device__ float4 transformPoint4x4(const float3& p, const float* matrix)
-{
-	float4 transformed = {
-		matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12],
-		matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13],
-		matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14],
-		matrix[3] * p.x + matrix[7] * p.y + matrix[11] * p.z + matrix[15]
-	};
-	return transformed;
-}
-
-__forceinline__ __device__ float3 transformVec4x3(const float3& p, const float* matrix)
-{
-	float3 transformed = {
-		matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z,
-		matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z,
-		matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z,
-	};
-	return transformed;
-}
-
-__forceinline__ __device__ float3 transformVec4x3Transpose(const float3& p, const float* matrix)
-{
-	float3 transformed = {
-		matrix[0] * p.x + matrix[1] * p.y + matrix[2] * p.z,
-		matrix[4] * p.x + matrix[5] * p.y + matrix[6] * p.z,
-		matrix[8] * p.x + matrix[9] * p.y + matrix[10] * p.z,
-	};
-	return transformed;
-}
-
-__forceinline__ __device__ float dnormvdz(float3 v, float3 dv)
-{
-	float sum2 = v.x * v.x + v.y * v.y + v.z * v.z;
-	float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2);
-	float dnormvdz = (-v.x * v.z * dv.x - v.y * v.z * dv.y + (sum2 - v.z * v.z) * dv.z) * invsum32;
-	return dnormvdz;
-}
-
-__forceinline__ __device__ float3 dnormvdv(float3 v, float3 dv)
-{
-	float sum2 = v.x * v.x + v.y * v.y + v.z * v.z;
-	float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2);
-
-	float3 dnormvdv;
-	dnormvdv.x = ((+sum2 - v.x * v.x) * dv.x - v.y * v.x * dv.y - v.z * v.x * dv.z) * invsum32;
-	dnormvdv.y = (-v.x * v.y * dv.x + (sum2 - v.y * v.y) * dv.y - v.z * v.y * dv.z) * invsum32;
-	dnormvdv.z = (-v.x * v.z * dv.x - v.y * v.z * dv.y + (sum2 - v.z * v.z) * dv.z) * invsum32;
-	return dnormvdv;
-}
-
-__forceinline__ __device__ float4 dnormvdv(float4 v, float4 dv)
-{
-	float sum2 = v.x * v.x + v.y * v.y + v.z * v.z + v.w * v.w;
-	float invsum32 = 1.0f / sqrt(sum2 * sum2 * sum2);
-
-	float4 vdv = { v.x * dv.x, v.y * dv.y, v.z * dv.z, v.w * dv.w };
-	float vdv_sum = vdv.x + vdv.y + vdv.z + vdv.w;
-	float4 dnormvdv;
-	dnormvdv.x = ((sum2 - v.x * v.x) * dv.x - v.x * (vdv_sum - vdv.x)) * invsum32;
-	dnormvdv.y = ((sum2 - v.y * v.y) * dv.y - v.y * (vdv_sum - vdv.y)) * invsum32;
-	dnormvdv.z = ((sum2 - v.z * v.z) * dv.z - v.z * (vdv_sum - vdv.z)) * invsum32;
-	dnormvdv.w = ((sum2 - v.w * v.w) * dv.w - v.w * (vdv_sum - vdv.w)) * invsum32;
-	return dnormvdv;
-}
-
-__forceinline__ __device__ float sigmoid(float x)
-{
-	return 1.0f / (1.0f + expf(-x));
-}
-
-__forceinline__ __host__ __device__ bool in_frustum(int idx,
-	const float* orig_points,
-	const float* viewmatrix,
-	const float* projmatrix,
-	bool prefiltered,
-	float3& p_view)
-{
-	float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] };
-
-	// Bring points to screen space
-	float4 p_hom = transformPoint4x4(p_orig, projmatrix);
-	float p_w = 1.0f / (p_hom.w + 0.0000001f);
-	float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
-	p_view = transformPoint4x3(p_orig, viewmatrix);
-
-	if (p_view.z <= 0.2f)// || ((p_proj.x < -1.3 || p_proj.x > 1.3 || p_proj.y < -1.3 || p_proj.y > 1.3)))
-	{
-		if (prefiltered)
-		{
-			printf("Point is filtered although prefiltered is set. This shouldn't happen!");
-			//__trap();
-		}
-		return false;
-	}
-	return true;
-}
-
-#define CHECK_CUDA(A, debug) \
-A; if(debug) { \
-auto ret = cudaDeviceSynchronize(); \
-if (ret != cudaSuccess) { \
-std::cerr << "\n[CUDA ERROR] in " << __FILE__ << "\nLine " << __LINE__ << ": " << cudaGetErrorString(ret); \
-throw std::runtime_error(cudaGetErrorString(ret)); \
-} \
-}
-
-#endif
\ No newline at end of file
diff --git a/csrc/reference/cuda_rasterizer/backward.cu b/csrc/reference/cuda_rasterizer/backward.cu
deleted file mode 100644
index 4aa41e1cb..000000000
--- a/csrc/reference/cuda_rasterizer/backward.cu
+++ /dev/null
@@ -1,657 +0,0 @@
-/*
- * Copyright (C) 2023, Inria
- * GRAPHDECO research group, https://team.inria.fr/graphdeco
- * All rights reserved.
- *
- * This software is free for non-commercial, research and evaluation use 
- * under the terms of the LICENSE.md file.
- *
- * For inquiries contact  george.drettakis@inria.fr
- */
-
-#include "backward.h"
-#include "auxiliary.h"
-#include <cooperative_groups.h>
-#include <cooperative_groups/reduce.h>
-namespace cg = cooperative_groups;
-
-// Backward pass for conversion of spherical harmonics to RGB for
-// each Gaussian.
-__device__ void computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, const bool* clamped, const glm::vec3* dL_dcolor, glm::vec3* dL_dmeans, glm::vec3* dL_dshs)
-{
-	// Compute intermediate values, as it is done during forward
-	glm::vec3 pos = means[idx];
-	glm::vec3 dir_orig = pos - campos;
-	glm::vec3 dir = dir_orig / glm::length(dir_orig);
-
-	glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs;
-
-	// Use PyTorch rule for clamping: if clamping was applied,
-	// gradient becomes 0.
-	glm::vec3 dL_dRGB = dL_dcolor[idx];
-	dL_dRGB.x *= clamped[3 * idx + 0] ? 0 : 1;
-	dL_dRGB.y *= clamped[3 * idx + 1] ? 0 : 1;
-	dL_dRGB.z *= clamped[3 * idx + 2] ? 0 : 1;
-
-	glm::vec3 dRGBdx(0, 0, 0);
-	glm::vec3 dRGBdy(0, 0, 0);
-	glm::vec3 dRGBdz(0, 0, 0);
-	float x = dir.x;
-	float y = dir.y;
-	float z = dir.z;
-
-	// Target location for this Gaussian to write SH gradients to
-	glm::vec3* dL_dsh = dL_dshs + idx * max_coeffs;
-
-	// No tricks here, just high school-level calculus.
-	float dRGBdsh0 = SH_C0;
-	dL_dsh[0] = dRGBdsh0 * dL_dRGB;
-	if (deg > 0)
-	{
-		float dRGBdsh1 = -SH_C1 * y;
-		float dRGBdsh2 = SH_C1 * z;
-		float dRGBdsh3 = -SH_C1 * x;
-		dL_dsh[1] = dRGBdsh1 * dL_dRGB;
-		dL_dsh[2] = dRGBdsh2 * dL_dRGB;
-		dL_dsh[3] = dRGBdsh3 * dL_dRGB;
-
-		dRGBdx = -SH_C1 * sh[3];
-		dRGBdy = -SH_C1 * sh[1];
-		dRGBdz = SH_C1 * sh[2];
-
-		if (deg > 1)
-		{
-			float xx = x * x, yy = y * y, zz = z * z;
-			float xy = x * y, yz = y * z, xz = x * z;
-
-			float dRGBdsh4 = SH_C2[0] * xy;
-			float dRGBdsh5 = SH_C2[1] * yz;
-			float dRGBdsh6 = SH_C2[2] * (2.f * zz - xx - yy);
-			float dRGBdsh7 = SH_C2[3] * xz;
-			float dRGBdsh8 = SH_C2[4] * (xx - yy);
-			dL_dsh[4] = dRGBdsh4 * dL_dRGB;
-			dL_dsh[5] = dRGBdsh5 * dL_dRGB;
-			dL_dsh[6] = dRGBdsh6 * dL_dRGB;
-			dL_dsh[7] = dRGBdsh7 * dL_dRGB;
-			dL_dsh[8] = dRGBdsh8 * dL_dRGB;
-
-			dRGBdx += SH_C2[0] * y * sh[4] + SH_C2[2] * 2.f * -x * sh[6] + SH_C2[3] * z * sh[7] + SH_C2[4] * 2.f * x * sh[8];
-			dRGBdy += SH_C2[0] * x * sh[4] + SH_C2[1] * z * sh[5] + SH_C2[2] * 2.f * -y * sh[6] + SH_C2[4] * 2.f * -y * sh[8];
-			dRGBdz += SH_C2[1] * y * sh[5] + SH_C2[2] * 2.f * 2.f * z * sh[6] + SH_C2[3] * x * sh[7];
-
-			if (deg > 2)
-			{
-				float dRGBdsh9 = SH_C3[0] * y * (3.f * xx - yy);
-				float dRGBdsh10 = SH_C3[1] * xy * z;
-				float dRGBdsh11 = SH_C3[2] * y * (4.f * zz - xx - yy);
-				float dRGBdsh12 = SH_C3[3] * z * (2.f * zz - 3.f * xx - 3.f * yy);
-				float dRGBdsh13 = SH_C3[4] * x * (4.f * zz - xx - yy);
-				float dRGBdsh14 = SH_C3[5] * z * (xx - yy);
-				float dRGBdsh15 = SH_C3[6] * x * (xx - 3.f * yy);
-				dL_dsh[9] = dRGBdsh9 * dL_dRGB;
-				dL_dsh[10] = dRGBdsh10 * dL_dRGB;
-				dL_dsh[11] = dRGBdsh11 * dL_dRGB;
-				dL_dsh[12] = dRGBdsh12 * dL_dRGB;
-				dL_dsh[13] = dRGBdsh13 * dL_dRGB;
-				dL_dsh[14] = dRGBdsh14 * dL_dRGB;
-				dL_dsh[15] = dRGBdsh15 * dL_dRGB;
-
-				dRGBdx += (
-					SH_C3[0] * sh[9] * 3.f * 2.f * xy +
-					SH_C3[1] * sh[10] * yz +
-					SH_C3[2] * sh[11] * -2.f * xy +
-					SH_C3[3] * sh[12] * -3.f * 2.f * xz +
-					SH_C3[4] * sh[13] * (-3.f * xx + 4.f * zz - yy) +
-					SH_C3[5] * sh[14] * 2.f * xz +
-					SH_C3[6] * sh[15] * 3.f * (xx - yy));
-
-				dRGBdy += (
-					SH_C3[0] * sh[9] * 3.f * (xx - yy) +
-					SH_C3[1] * sh[10] * xz +
-					SH_C3[2] * sh[11] * (-3.f * yy + 4.f * zz - xx) +
-					SH_C3[3] * sh[12] * -3.f * 2.f * yz +
-					SH_C3[4] * sh[13] * -2.f * xy +
-					SH_C3[5] * sh[14] * -2.f * yz +
-					SH_C3[6] * sh[15] * -3.f * 2.f * xy);
-
-				dRGBdz += (
-					SH_C3[1] * sh[10] * xy +
-					SH_C3[2] * sh[11] * 4.f * 2.f * yz +
-					SH_C3[3] * sh[12] * 3.f * (2.f * zz - xx - yy) +
-					SH_C3[4] * sh[13] * 4.f * 2.f * xz +
-					SH_C3[5] * sh[14] * (xx - yy));
-			}
-		}
-	}
-
-	// The view direction is an input to the computation. View direction
-	// is influenced by the Gaussian's mean, so SHs gradients
-	// must propagate back into 3D position.
-	glm::vec3 dL_ddir(glm::dot(dRGBdx, dL_dRGB), glm::dot(dRGBdy, dL_dRGB), glm::dot(dRGBdz, dL_dRGB));
-
-	// Account for normalization of direction
-	float3 dL_dmean = dnormvdv(float3{ dir_orig.x, dir_orig.y, dir_orig.z }, float3{ dL_ddir.x, dL_ddir.y, dL_ddir.z });
-
-	// Gradients of loss w.r.t. Gaussian means, but only the portion 
-	// that is caused because the mean affects the view-dependent color.
-	// Additional mean gradient is accumulated in below methods.
-	dL_dmeans[idx] += glm::vec3(dL_dmean.x, dL_dmean.y, dL_dmean.z);
-}
-
-// Backward version of INVERSE 2D covariance matrix computation
-// (due to length launched as separate kernel before other 
-// backward steps contained in preprocess)
-__global__ void computeCov2DCUDA(int P,
-	const float3* means,
-	const int* radii,
-	const float* cov3Ds,
-	const float h_x, float h_y,
-	const float tan_fovx, float tan_fovy,
-	const float* view_matrix,
-	const float* dL_dconics,
-	float3* dL_dmeans,
-	float* dL_dcov)
-{
-	auto idx = cg::this_grid().thread_rank();
-	if (idx >= P || !(radii[idx] > 0))
-		return;
-
-	// Reading location of 3D covariance for this Gaussian
-	const float* cov3D = cov3Ds + 6 * idx;
-
-	// Fetch gradients, recompute 2D covariance and relevant 
-	// intermediate forward results needed in the backward.
-	float3 mean = means[idx];
-	float3 dL_dconic = { dL_dconics[4 * idx], dL_dconics[4 * idx + 1], dL_dconics[4 * idx + 3] };
-	float3 t = transformPoint4x3(mean, view_matrix);
-	
-	const float limx = 1.3f * tan_fovx;
-	const float limy = 1.3f * tan_fovy;
-	const float txtz = t.x / t.z;
-	const float tytz = t.y / t.z;
-	t.x = min(limx, max(-limx, txtz)) * t.z;
-	t.y = min(limy, max(-limy, tytz)) * t.z;
-	
-	const float x_grad_mul = txtz < -limx || txtz > limx ? 0 : 1;
-	const float y_grad_mul = tytz < -limy || tytz > limy ? 0 : 1;
-
-	glm::mat3 J = glm::mat3(h_x / t.z, 0.0f, -(h_x * t.x) / (t.z * t.z),
-		0.0f, h_y / t.z, -(h_y * t.y) / (t.z * t.z),
-		0, 0, 0);
-
-	glm::mat3 W = glm::mat3(
-		view_matrix[0], view_matrix[4], view_matrix[8],
-		view_matrix[1], view_matrix[5], view_matrix[9],
-		view_matrix[2], view_matrix[6], view_matrix[10]);
-
-	glm::mat3 Vrk = glm::mat3(
-		cov3D[0], cov3D[1], cov3D[2],
-		cov3D[1], cov3D[3], cov3D[4],
-		cov3D[2], cov3D[4], cov3D[5]);
-
-	glm::mat3 T = W * J;
-
-	glm::mat3 cov2D = glm::transpose(T) * glm::transpose(Vrk) * T;
-
-	// Use helper variables for 2D covariance entries. More compact.
-	float a = cov2D[0][0] += 0.3f;
-	float b = cov2D[0][1];
-	float c = cov2D[1][1] += 0.3f;
-
-	float denom = a * c - b * b;
-	float dL_da = 0, dL_db = 0, dL_dc = 0;
-	float denom2inv = 1.0f / ((denom * denom) + 0.0000001f);
-
-	if (denom2inv != 0)
-	{
-		// Gradients of loss w.r.t. entries of 2D covariance matrix,
-		// given gradients of loss w.r.t. conic matrix (inverse covariance matrix).
-		// e.g., dL / da = dL / d_conic_a * d_conic_a / d_a
-		dL_da = denom2inv * (-c * c * dL_dconic.x + 2 * b * c * dL_dconic.y + (denom - a * c) * dL_dconic.z);
-		dL_dc = denom2inv * (-a * a * dL_dconic.z + 2 * a * b * dL_dconic.y + (denom - a * c) * dL_dconic.x);
-		dL_db = denom2inv * 2 * (b * c * dL_dconic.x - (denom + 2 * b * b) * dL_dconic.y + a * b * dL_dconic.z);
-
-		// Gradients of loss L w.r.t. each 3D covariance matrix (Vrk) entry, 
-		// given gradients w.r.t. 2D covariance matrix (diagonal).
-		// cov2D = transpose(T) * transpose(Vrk) * T;
-		dL_dcov[6 * idx + 0] = (T[0][0] * T[0][0] * dL_da + T[0][0] * T[1][0] * dL_db + T[1][0] * T[1][0] * dL_dc);
-		dL_dcov[6 * idx + 3] = (T[0][1] * T[0][1] * dL_da + T[0][1] * T[1][1] * dL_db + T[1][1] * T[1][1] * dL_dc);
-		dL_dcov[6 * idx + 5] = (T[0][2] * T[0][2] * dL_da + T[0][2] * T[1][2] * dL_db + T[1][2] * T[1][2] * dL_dc);
-
-		// Gradients of loss L w.r.t. each 3D covariance matrix (Vrk) entry, 
-		// given gradients w.r.t. 2D covariance matrix (off-diagonal).
-		// Off-diagonal elements appear twice --> double the gradient.
-		// cov2D = transpose(T) * transpose(Vrk) * T;
-		dL_dcov[6 * idx + 1] = 2 * T[0][0] * T[0][1] * dL_da + (T[0][0] * T[1][1] + T[0][1] * T[1][0]) * dL_db + 2 * T[1][0] * T[1][1] * dL_dc;
-		dL_dcov[6 * idx + 2] = 2 * T[0][0] * T[0][2] * dL_da + (T[0][0] * T[1][2] + T[0][2] * T[1][0]) * dL_db + 2 * T[1][0] * T[1][2] * dL_dc;
-		dL_dcov[6 * idx + 4] = 2 * T[0][2] * T[0][1] * dL_da + (T[0][1] * T[1][2] + T[0][2] * T[1][1]) * dL_db + 2 * T[1][1] * T[1][2] * dL_dc;
-	}
-	else
-	{
-		for (int i = 0; i < 6; i++)
-			dL_dcov[6 * idx + i] = 0;
-	}
-
-	// Gradients of loss w.r.t. upper 2x3 portion of intermediate matrix T
-	// cov2D = transpose(T) * transpose(Vrk) * T;
-	float dL_dT00 = 2 * (T[0][0] * Vrk[0][0] + T[0][1] * Vrk[0][1] + T[0][2] * Vrk[0][2]) * dL_da +
-		(T[1][0] * Vrk[0][0] + T[1][1] * Vrk[0][1] + T[1][2] * Vrk[0][2]) * dL_db;
-	float dL_dT01 = 2 * (T[0][0] * Vrk[1][0] + T[0][1] * Vrk[1][1] + T[0][2] * Vrk[1][2]) * dL_da +
-		(T[1][0] * Vrk[1][0] + T[1][1] * Vrk[1][1] + T[1][2] * Vrk[1][2]) * dL_db;
-	float dL_dT02 = 2 * (T[0][0] * Vrk[2][0] + T[0][1] * Vrk[2][1] + T[0][2] * Vrk[2][2]) * dL_da +
-		(T[1][0] * Vrk[2][0] + T[1][1] * Vrk[2][1] + T[1][2] * Vrk[2][2]) * dL_db;
-	float dL_dT10 = 2 * (T[1][0] * Vrk[0][0] + T[1][1] * Vrk[0][1] + T[1][2] * Vrk[0][2]) * dL_dc +
-		(T[0][0] * Vrk[0][0] + T[0][1] * Vrk[0][1] + T[0][2] * Vrk[0][2]) * dL_db;
-	float dL_dT11 = 2 * (T[1][0] * Vrk[1][0] + T[1][1] * Vrk[1][1] + T[1][2] * Vrk[1][2]) * dL_dc +
-		(T[0][0] * Vrk[1][0] + T[0][1] * Vrk[1][1] + T[0][2] * Vrk[1][2]) * dL_db;
-	float dL_dT12 = 2 * (T[1][0] * Vrk[2][0] + T[1][1] * Vrk[2][1] + T[1][2] * Vrk[2][2]) * dL_dc +
-		(T[0][0] * Vrk[2][0] + T[0][1] * Vrk[2][1] + T[0][2] * Vrk[2][2]) * dL_db;
-
-	// Gradients of loss w.r.t. upper 3x2 non-zero entries of Jacobian matrix
-	// T = W * J
-	float dL_dJ00 = W[0][0] * dL_dT00 + W[0][1] * dL_dT01 + W[0][2] * dL_dT02;
-	float dL_dJ02 = W[2][0] * dL_dT00 + W[2][1] * dL_dT01 + W[2][2] * dL_dT02;
-	float dL_dJ11 = W[1][0] * dL_dT10 + W[1][1] * dL_dT11 + W[1][2] * dL_dT12;
-	float dL_dJ12 = W[2][0] * dL_dT10 + W[2][1] * dL_dT11 + W[2][2] * dL_dT12;
-
-	float tz = 1.f / t.z;
-	float tz2 = tz * tz;
-	float tz3 = tz2 * tz;
-
-	// Gradients of loss w.r.t. transformed Gaussian mean t
-	float dL_dtx = x_grad_mul * -h_x * tz2 * dL_dJ02;
-	float dL_dty = y_grad_mul * -h_y * tz2 * dL_dJ12;
-	float dL_dtz = -h_x * tz2 * dL_dJ00 - h_y * tz2 * dL_dJ11 + (2 * h_x * t.x) * tz3 * dL_dJ02 + (2 * h_y * t.y) * tz3 * dL_dJ12;
-
-	// Account for transformation of mean to t
-	// t = transformPoint4x3(mean, view_matrix);
-	float3 dL_dmean = transformVec4x3Transpose({ dL_dtx, dL_dty, dL_dtz }, view_matrix);
-
-	// Gradients of loss w.r.t. Gaussian means, but only the portion 
-	// that is caused because the mean affects the covariance matrix.
-	// Additional mean gradient is accumulated in BACKWARD::preprocess.
-	dL_dmeans[idx] = dL_dmean;
-}
-
-// Backward pass for the conversion of scale and rotation to a 
-// 3D covariance matrix for each Gaussian. 
-__device__ void computeCov3D(int idx, const glm::vec3 scale, float mod, const glm::vec4 rot, const float* dL_dcov3Ds, glm::vec3* dL_dscales, glm::vec4* dL_drots)
-{
-	// Recompute (intermediate) results for the 3D covariance computation.
-	glm::vec4 q = rot;// / glm::length(rot);
-	float r = q.x;
-	float x = q.y;
-	float y = q.z;
-	float z = q.w;
-
-	glm::mat3 R = glm::mat3(
-		1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y),
-		2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x),
-		2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y)
-	);
-
-	glm::mat3 S = glm::mat3(1.0f);
-
-	glm::vec3 s = mod * scale;
-	S[0][0] = s.x;
-	S[1][1] = s.y;
-	S[2][2] = s.z;
-
-	glm::mat3 M = S * R;
-
-	const float* dL_dcov3D = dL_dcov3Ds + 6 * idx;
-
-	glm::vec3 dunc(dL_dcov3D[0], dL_dcov3D[3], dL_dcov3D[5]);
-	glm::vec3 ounc = 0.5f * glm::vec3(dL_dcov3D[1], dL_dcov3D[2], dL_dcov3D[4]);
-
-	// Convert per-element covariance loss gradients to matrix form
-	glm::mat3 dL_dSigma = glm::mat3(
-		dL_dcov3D[0], 0.5f * dL_dcov3D[1], 0.5f * dL_dcov3D[2],
-		0.5f * dL_dcov3D[1], dL_dcov3D[3], 0.5f * dL_dcov3D[4],
-		0.5f * dL_dcov3D[2], 0.5f * dL_dcov3D[4], dL_dcov3D[5]
-	);
-
-	// Compute loss gradient w.r.t. matrix M
-	// dSigma_dM = 2 * M
-	glm::mat3 dL_dM = 2.0f * M * dL_dSigma;
-
-	glm::mat3 Rt = glm::transpose(R);
-	glm::mat3 dL_dMt = glm::transpose(dL_dM);
-
-	// Gradients of loss w.r.t. scale
-	glm::vec3* dL_dscale = dL_dscales + idx;
-	dL_dscale->x = glm::dot(Rt[0], dL_dMt[0]);
-	dL_dscale->y = glm::dot(Rt[1], dL_dMt[1]);
-	dL_dscale->z = glm::dot(Rt[2], dL_dMt[2]);
-
-	dL_dMt[0] *= s.x;
-	dL_dMt[1] *= s.y;
-	dL_dMt[2] *= s.z;
-
-	// Gradients of loss w.r.t. normalized quaternion
-	glm::vec4 dL_dq;
-	dL_dq.x = 2 * z * (dL_dMt[0][1] - dL_dMt[1][0]) + 2 * y * (dL_dMt[2][0] - dL_dMt[0][2]) + 2 * x * (dL_dMt[1][2] - dL_dMt[2][1]);
-	dL_dq.y = 2 * y * (dL_dMt[1][0] + dL_dMt[0][1]) + 2 * z * (dL_dMt[2][0] + dL_dMt[0][2]) + 2 * r * (dL_dMt[1][2] - dL_dMt[2][1]) - 4 * x * (dL_dMt[2][2] + dL_dMt[1][1]);
-	dL_dq.z = 2 * x * (dL_dMt[1][0] + dL_dMt[0][1]) + 2 * r * (dL_dMt[2][0] - dL_dMt[0][2]) + 2 * z * (dL_dMt[1][2] + dL_dMt[2][1]) - 4 * y * (dL_dMt[2][2] + dL_dMt[0][0]);
-	dL_dq.w = 2 * r * (dL_dMt[0][1] - dL_dMt[1][0]) + 2 * x * (dL_dMt[2][0] + dL_dMt[0][2]) + 2 * y * (dL_dMt[1][2] + dL_dMt[2][1]) - 4 * z * (dL_dMt[1][1] + dL_dMt[0][0]);
-
-	// Gradients of loss w.r.t. unnormalized quaternion
-	float4* dL_drot = (float4*)(dL_drots + idx);
-	*dL_drot = float4{ dL_dq.x, dL_dq.y, dL_dq.z, dL_dq.w };//dnormvdv(float4{ rot.x, rot.y, rot.z, rot.w }, float4{ dL_dq.x, dL_dq.y, dL_dq.z, dL_dq.w });
-}
-
-// Backward pass of the preprocessing steps, except
-// for the covariance computation and inversion
-// (those are handled by a previous kernel call)
-template<int C>
-__global__ void preprocessCUDA(
-	int P, int D, int M,
-	const float3* means,
-	const int* radii,
-	const float* shs,
-	const bool* clamped,
-	const glm::vec3* scales,
-	const glm::vec4* rotations,
-	const float scale_modifier,
-	const float* proj,
-	const glm::vec3* campos,
-	const float3* dL_dmean2D,
-	glm::vec3* dL_dmeans,
-	float* dL_dcolor,
-	float* dL_dcov3D,
-	float* dL_dsh,
-	glm::vec3* dL_dscale,
-	glm::vec4* dL_drot)
-{
-	auto idx = cg::this_grid().thread_rank();
-	if (idx >= P || !(radii[idx] > 0))
-		return;
-
-	float3 m = means[idx];
-
-	// Taking care of gradients from the screenspace points
-	float4 m_hom = transformPoint4x4(m, proj);
-	float m_w = 1.0f / (m_hom.w + 0.0000001f);
-
-	// Compute loss gradient w.r.t. 3D means due to gradients of 2D means
-	// from rendering procedure
-	glm::vec3 dL_dmean;
-	float mul1 = (proj[0] * m.x + proj[4] * m.y + proj[8] * m.z + proj[12]) * m_w * m_w;
-	float mul2 = (proj[1] * m.x + proj[5] * m.y + proj[9] * m.z + proj[13]) * m_w * m_w;
-	dL_dmean.x = (proj[0] * m_w - proj[3] * mul1) * dL_dmean2D[idx].x + (proj[1] * m_w - proj[3] * mul2) * dL_dmean2D[idx].y;
-	dL_dmean.y = (proj[4] * m_w - proj[7] * mul1) * dL_dmean2D[idx].x + (proj[5] * m_w - proj[7] * mul2) * dL_dmean2D[idx].y;
-	dL_dmean.z = (proj[8] * m_w - proj[11] * mul1) * dL_dmean2D[idx].x + (proj[9] * m_w - proj[11] * mul2) * dL_dmean2D[idx].y;
-
-	// That's the second part of the mean gradient. Previous computation
-	// of cov2D and following SH conversion also affects it.
-	dL_dmeans[idx] += dL_dmean;
-
-	// Compute gradient updates due to computing colors from SHs
-	if (shs)
-		computeColorFromSH(idx, D, M, (glm::vec3*)means, *campos, shs, clamped, (glm::vec3*)dL_dcolor, (glm::vec3*)dL_dmeans, (glm::vec3*)dL_dsh);
-
-	// Compute gradient updates due to computing covariance from scale/rotation
-	if (scales)
-		computeCov3D(idx, scales[idx], scale_modifier, rotations[idx], dL_dcov3D, dL_dscale, dL_drot);
-}
-
-// Backward version of the rendering procedure.
-template <uint32_t C>
-__global__ void __launch_bounds__(BLOCK_X * BLOCK_Y)
-renderCUDA(
-	const uint2* __restrict__ ranges,
-	const uint32_t* __restrict__ point_list,
-	int W, int H,
-	const float* __restrict__ bg_color,
-	const float2* __restrict__ points_xy_image,
-	const float4* __restrict__ conic_opacity,
-	const float* __restrict__ colors,
-	const float* __restrict__ final_Ts,
-	const uint32_t* __restrict__ n_contrib,
-	const float* __restrict__ dL_dpixels,
-	float3* __restrict__ dL_dmean2D,
-	float4* __restrict__ dL_dconic2D,
-	float* __restrict__ dL_dopacity,
-	float* __restrict__ dL_dcolors)
-{
-	// We rasterize again. Compute necessary block info.
-	auto block = cg::this_thread_block();
-	const uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
-	const uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
-	const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
-	const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
-	const uint32_t pix_id = W * pix.y + pix.x;
-	const float2 pixf = { (float)pix.x, (float)pix.y };
-
-	const bool inside = pix.x < W&& pix.y < H;
-	const uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
-
-	const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE);
-
-	bool done = !inside;
-	int toDo = range.y - range.x;
-
-	__shared__ int collected_id[BLOCK_SIZE];
-	__shared__ float2 collected_xy[BLOCK_SIZE];
-	__shared__ float4 collected_conic_opacity[BLOCK_SIZE];
-	__shared__ float collected_colors[C * BLOCK_SIZE];
-
-	// In the forward, we stored the final value for T, the
-	// product of all (1 - alpha) factors. 
-	const float T_final = inside ? final_Ts[pix_id] : 0;
-	float T = T_final;
-
-	// We start from the back. The ID of the last contributing
-	// Gaussian is known from each pixel from the forward.
-	uint32_t contributor = toDo;
-	const int last_contributor = inside ? n_contrib[pix_id] : 0;
-
-	float accum_rec[C] = { 0 };
-	float dL_dpixel[C];
-	if (inside)
-		for (int i = 0; i < C; i++)
-			dL_dpixel[i] = dL_dpixels[i * H * W + pix_id];
-
-	float last_alpha = 0;
-	float last_color[C] = { 0 };
-
-	// Gradient of pixel coordinate w.r.t. normalized 
-	// screen-space viewport corrdinates (-1 to 1)
-	const float ddelx_dx = 0.5 * W;
-	const float ddely_dy = 0.5 * H;
-
-	// Traverse all Gaussians
-	for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
-	{
-		// Load auxiliary data into shared memory, start in the BACK
-		// and load them in revers order.
-		block.sync();
-		const int progress = i * BLOCK_SIZE + block.thread_rank();
-		if (range.x + progress < range.y)
-		{
-			const int coll_id = point_list[range.y - progress - 1];
-			collected_id[block.thread_rank()] = coll_id;
-			collected_xy[block.thread_rank()] = points_xy_image[coll_id];
-			collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id];
-			for (int i = 0; i < C; i++)
-				collected_colors[i * BLOCK_SIZE + block.thread_rank()] = colors[coll_id * C + i];
-		}
-		block.sync();
-
-		// Iterate over Gaussians
-		for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++)
-		{
-			// Keep track of current Gaussian ID. Skip, if this one
-			// is behind the last contributor for this pixel.
-			contributor--;
-			if (contributor >= last_contributor)
-				continue;
-
-			// Compute blending values, as before.
-			const float2 xy = collected_xy[j];
-			const float2 d = { xy.x - pixf.x, xy.y - pixf.y };
-			const float4 con_o = collected_conic_opacity[j];
-			const float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y;
-			if (power > 0.0f)
-				continue;
-
-			const float G = exp(power);
-			const float alpha = min(0.99f, con_o.w * G);
-			if (alpha < 1.0f / 255.0f)
-				continue;
-
-			T = T / (1.f - alpha);
-			const float dchannel_dcolor = alpha * T;
-
-			// Propagate gradients to per-Gaussian colors and keep
-			// gradients w.r.t. alpha (blending factor for a Gaussian/pixel
-			// pair).
-			float dL_dalpha = 0.0f;
-			const int global_id = collected_id[j];
-			for (int ch = 0; ch < C; ch++)
-			{
-				const float c = collected_colors[ch * BLOCK_SIZE + j];
-				// Update last color (to be used in the next iteration)
-				accum_rec[ch] = last_alpha * last_color[ch] + (1.f - last_alpha) * accum_rec[ch];
-				last_color[ch] = c;
-
-				const float dL_dchannel = dL_dpixel[ch];
-				dL_dalpha += (c - accum_rec[ch]) * dL_dchannel;
-				// Update the gradients w.r.t. color of the Gaussian. 
-				// Atomic, since this pixel is just one of potentially
-				// many that were affected by this Gaussian.
-				atomicAdd(&(dL_dcolors[global_id * C + ch]), dchannel_dcolor * dL_dchannel);
-			}
-			dL_dalpha *= T;
-			// Update last alpha (to be used in the next iteration)
-			last_alpha = alpha;
-
-			// Account for fact that alpha also influences how much of
-			// the background color is added if nothing left to blend
-			float bg_dot_dpixel = 0;
-			for (int i = 0; i < C; i++)
-				bg_dot_dpixel += bg_color[i] * dL_dpixel[i];
-			dL_dalpha += (-T_final / (1.f - alpha)) * bg_dot_dpixel;
-
-
-			// Helpful reusable temporary variables
-			const float dL_dG = con_o.w * dL_dalpha;
-			const float gdx = G * d.x;
-			const float gdy = G * d.y;
-			const float dG_ddelx = -gdx * con_o.x - gdy * con_o.y;
-			const float dG_ddely = -gdy * con_o.z - gdx * con_o.y;
-
-			// Update gradients w.r.t. 2D mean position of the Gaussian
-			atomicAdd(&dL_dmean2D[global_id].x, dL_dG * dG_ddelx * ddelx_dx);
-			atomicAdd(&dL_dmean2D[global_id].y, dL_dG * dG_ddely * ddely_dy);
-
-			// Update gradients w.r.t. 2D covariance (2x2 matrix, symmetric)
-			atomicAdd(&dL_dconic2D[global_id].x, -0.5f * gdx * d.x * dL_dG);
-			atomicAdd(&dL_dconic2D[global_id].y, -0.5f * gdx * d.y * dL_dG);
-			atomicAdd(&dL_dconic2D[global_id].w, -0.5f * gdy * d.y * dL_dG);
-
-			// Update gradients w.r.t. opacity of the Gaussian
-			atomicAdd(&(dL_dopacity[global_id]), G * dL_dalpha);
-		}
-	}
-}
-
-void BACKWARD::preprocess(
-	int P, int D, int M,
-	const float3* means3D,
-	const int* radii,
-	const float* shs,
-	const bool* clamped,
-	const glm::vec3* scales,
-	const glm::vec4* rotations,
-	const float scale_modifier,
-	const float* cov3Ds,
-	const float* viewmatrix,
-	const float* projmatrix,
-	const float focal_x, float focal_y,
-	const float tan_fovx, float tan_fovy,
-	const glm::vec3* campos,
-	const float3* dL_dmean2D,
-	const float* dL_dconic,
-	glm::vec3* dL_dmean3D,
-	float* dL_dcolor,
-	float* dL_dcov3D,
-	float* dL_dsh,
-	glm::vec3* dL_dscale,
-	glm::vec4* dL_drot)
-{
-	// Propagate gradients for the path of 2D conic matrix computation. 
-	// Somewhat long, thus it is its own kernel rather than being part of 
-	// "preprocess". When done, loss gradient w.r.t. 3D means has been
-	// modified and gradient w.r.t. 3D covariance matrix has been computed.	
-	computeCov2DCUDA << <(P + 255) / 256, 256 >> > (
-		P,
-		means3D,
-		radii,
-		cov3Ds,
-		focal_x,
-		focal_y,
-		tan_fovx,
-		tan_fovy,
-		viewmatrix,
-		dL_dconic,
-		(float3*)dL_dmean3D,
-		dL_dcov3D);
-
-	// Propagate gradients for remaining steps: finish 3D mean gradients,
-	// propagate color gradients to SH (if desireD), propagate 3D covariance
-	// matrix gradients to scale and rotation.
-	preprocessCUDA<NUM_CHANNELS> << < (P + 255) / 256, 256 >> > (
-		P, D, M,
-		(float3*)means3D,
-		radii,
-		shs,
-		clamped,
-		(glm::vec3*)scales,
-		(glm::vec4*)rotations,
-		scale_modifier,
-		projmatrix,
-		campos,
-		(float3*)dL_dmean2D,
-		(glm::vec3*)dL_dmean3D,
-		dL_dcolor,
-		dL_dcov3D,
-		dL_dsh,
-		dL_dscale,
-		dL_drot);
-}
-
-void BACKWARD::render(
-	const dim3 grid, const dim3 block,
-	const uint2* ranges,
-	const uint32_t* point_list,
-	int W, int H,
-	const float* bg_color,
-	const float2* means2D,
-	const float4* conic_opacity,
-	const float* colors,
-	const float* final_Ts,
-	const uint32_t* n_contrib,
-	const float* dL_dpixels,
-	float3* dL_dmean2D,
-	float4* dL_dconic2D,
-	float* dL_dopacity,
-	float* dL_dcolors)
-{
-	renderCUDA<NUM_CHANNELS> << <grid, block >> >(
-		ranges,
-		point_list,
-		W, H,
-		bg_color,
-		means2D,
-		conic_opacity,
-		colors,
-		final_Ts,
-		n_contrib,
-		dL_dpixels,
-		dL_dmean2D,
-		dL_dconic2D,
-		dL_dopacity,
-		dL_dcolors
-		);
-}
\ No newline at end of file
diff --git a/csrc/reference/cuda_rasterizer/backward.h b/csrc/reference/cuda_rasterizer/backward.h
deleted file mode 100644
index 93dd2e4be..000000000
--- a/csrc/reference/cuda_rasterizer/backward.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (C) 2023, Inria
- * GRAPHDECO research group, https://team.inria.fr/graphdeco
- * All rights reserved.
- *
- * This software is free for non-commercial, research and evaluation use 
- * under the terms of the LICENSE.md file.
- *
- * For inquiries contact  george.drettakis@inria.fr
- */
-
-#ifndef CUDA_RASTERIZER_BACKWARD_H_INCLUDED
-#define CUDA_RASTERIZER_BACKWARD_H_INCLUDED
-
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-#define GLM_FORCE_CUDA
-#include <glm/glm.hpp>
-
-namespace BACKWARD
-{
-	void render(
-		const dim3 grid, dim3 block,
-		const uint2* ranges,
-		const uint32_t* point_list,
-		int W, int H,
-		const float* bg_color,
-		const float2* means2D,
-		const float4* conic_opacity,
-		const float* colors,
-		const float* final_Ts,
-		const uint32_t* n_contrib,
-		const float* dL_dpixels,
-		float3* dL_dmean2D,
-		float4* dL_dconic2D,
-		float* dL_dopacity,
-		float* dL_dcolors);
-
-	void preprocess(
-		int P, int D, int M,
-		const float3* means,
-		const int* radii,
-		const float* shs,
-		const bool* clamped,
-		const glm::vec3* scales,
-		const glm::vec4* rotations,
-		const float scale_modifier,
-		const float* cov3Ds,
-		const float* view,
-		const float* proj,
-		const float focal_x, float focal_y,
-		const float tan_fovx, float tan_fovy,
-		const glm::vec3* campos,
-		const float3* dL_dmean2D,
-		const float* dL_dconics,
-		glm::vec3* dL_dmeans,
-		float* dL_dcolor,
-		float* dL_dcov3D,
-		float* dL_dsh,
-		glm::vec3* dL_dscale,
-		glm::vec4* dL_drot);
-}
-
-#endif
\ No newline at end of file
diff --git a/csrc/reference/cuda_rasterizer/config.h b/csrc/reference/cuda_rasterizer/config.h
deleted file mode 100644
index 2a912fb34..000000000
--- a/csrc/reference/cuda_rasterizer/config.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (C) 2023, Inria
- * GRAPHDECO research group, https://team.inria.fr/graphdeco
- * All rights reserved.
- *
- * This software is free for non-commercial, research and evaluation use 
- * under the terms of the LICENSE.md file.
- *
- * For inquiries contact  george.drettakis@inria.fr
- */
-
-#ifndef CUDA_RASTERIZER_CONFIG_H_INCLUDED
-#define CUDA_RASTERIZER_CONFIG_H_INCLUDED
-
-#define NUM_CHANNELS 3 // Default 3, RGB
-#define BLOCK_X 16
-#define BLOCK_Y 16
-
-#endif
\ No newline at end of file
diff --git a/csrc/reference/cuda_rasterizer/forward.cu b/csrc/reference/cuda_rasterizer/forward.cu
deleted file mode 100644
index 1f2b1026a..000000000
--- a/csrc/reference/cuda_rasterizer/forward.cu
+++ /dev/null
@@ -1,477 +0,0 @@
-/*
- * Copyright (C) 2023, Inria
- * GRAPHDECO research group, https://team.inria.fr/graphdeco
- * All rights reserved.
- *
- * This software is free for non-commercial, research and evaluation use 
- * under the terms of the LICENSE.md file.
- *
- * For inquiries contact  george.drettakis@inria.fr
- */
-
-#include "forward.h"
-#include "auxiliary.h"
-#include <cooperative_groups.h>
-#include <cooperative_groups/reduce.h>
-namespace cg = cooperative_groups;
-
-// Forward method for converting the input spherical harmonics
-// coefficients of each Gaussian to a simple RGB color.
-__device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, bool* clamped)
-{
-	// The implementation is loosely based on code for 
-	// "Differentiable Point-Based Radiance Fields for 
-	// Efficient View Synthesis" by Zhang et al. (2022)
-	glm::vec3 pos = means[idx];
-	glm::vec3 dir = pos - campos;
-	dir = dir / glm::length(dir);
-
-	glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs;
-	glm::vec3 result = SH_C0 * sh[0];
-
-	if (deg > 0)
-	{
-		float x = dir.x;
-		float y = dir.y;
-		float z = dir.z;
-		result = result - SH_C1 * y * sh[1] + SH_C1 * z * sh[2] - SH_C1 * x * sh[3];
-
-		if (deg > 1)
-		{
-			float xx = x * x, yy = y * y, zz = z * z;
-			float xy = x * y, yz = y * z, xz = x * z;
-			result = result +
-				SH_C2[0] * xy * sh[4] +
-				SH_C2[1] * yz * sh[5] +
-				SH_C2[2] * (2.0f * zz - xx - yy) * sh[6] +
-				SH_C2[3] * xz * sh[7] +
-				SH_C2[4] * (xx - yy) * sh[8];
-
-			if (deg > 2)
-			{
-				result = result +
-					SH_C3[0] * y * (3.0f * xx - yy) * sh[9] +
-					SH_C3[1] * xy * z * sh[10] +
-					SH_C3[2] * y * (4.0f * zz - xx - yy) * sh[11] +
-					SH_C3[3] * z * (2.0f * zz - 3.0f * xx - 3.0f * yy) * sh[12] +
-					SH_C3[4] * x * (4.0f * zz - xx - yy) * sh[13] +
-					SH_C3[5] * z * (xx - yy) * sh[14] +
-					SH_C3[6] * x * (xx - 3.0f * yy) * sh[15];
-			}
-		}
-	}
-	result += 0.5f;
-
-	// RGB colors are clamped to positive values. If values are
-	// clamped, we need to keep track of this for the backward pass.
-	clamped[3 * idx + 0] = (result.x < 0);
-	clamped[3 * idx + 1] = (result.y < 0);
-	clamped[3 * idx + 2] = (result.z < 0);
-	return glm::max(result, 0.0f);
-}
-
-// Forward version of 2D covariance matrix computation
-__host__ __device__ float3 computeCov2D(const float3& mean, float focal_x, float focal_y, float tan_fovx, float tan_fovy, const float* cov3D, const float* viewmatrix)
-{
-	// The following models the steps outlined by equations 29
-	// and 31 in "EWA Splatting" (Zwicker et al., 2002). 
-	// Additionally considers aspect / scaling of viewport.
-	// Transposes used to account for row-/column-major conventions.
-	float3 t = transformPoint4x3(mean, viewmatrix);
-    // printf("ref %f %f %f\n", t.x, t.y, t.z);
-
-    const float limx = 1.3f * tan_fovx;
-    const float limy = 1.3f * tan_fovy;
-	const float txtz = t.x / t.z;
-	const float tytz = t.y / t.z;
-	t.x = min(limx, max(-limx, txtz)) * t.z;
-	t.y = min(limy, max(-limy, tytz)) * t.z;
-    // printf("ref clipped %f %f %f\n", t.x, t.y, t.z);
-
-	glm::mat3 J = glm::mat3(
-		focal_x / t.z, 0.0f, -(focal_x * t.x) / (t.z * t.z),
-		0.0f, focal_y / t.z, -(focal_y * t.y) / (t.z * t.z),
-		0, 0, 0);
-    // printf("ref J\n %f %f %f\n %f %f %f\n",
-    //     J[0][0], J[1][0], J[2][0],
-    //     J[0][1], J[1][1], J[2][1]
-    // );
-
-	glm::mat3 W = glm::mat3(
-		viewmatrix[0], viewmatrix[4], viewmatrix[8],
-		viewmatrix[1], viewmatrix[5], viewmatrix[9],
-		viewmatrix[2], viewmatrix[6], viewmatrix[10]);
-
-	glm::mat3 T = W * J;
-
-    // printf("ref T\n %f %f %f\n %f %f %f\n %f %f %f\n",
-    //     T[0][0], T[1][0], T[2][0],
-    //     T[0][1], T[1][1], T[2][1],
-    //     T[0][2], T[1][2], T[2][2]
-    // );
-
-	glm::mat3 Vrk = glm::mat3(
-		cov3D[0], cov3D[1], cov3D[2],
-		cov3D[1], cov3D[3], cov3D[4],
-		cov3D[2], cov3D[4], cov3D[5]);
-
-    // printf("ref Vrk\n %f %f %f\n %f %f %f\n %f %f %f\n",
-    //     Vrk[0][0], Vrk[1][0], Vrk[2][0],
-    //     Vrk[0][1], Vrk[1][1], Vrk[2][1],
-    //     Vrk[0][2], Vrk[1][2], Vrk[2][2]
-    // );
-	glm::mat3 cov = glm::transpose(T) * glm::transpose(Vrk) * T;
-    // printf("ref cov\n %f %f %f\n %f %f %f\n %f %f %f\n",
-    //     cov[0][0], cov[1][0], cov[2][0],
-    //     cov[0][1], cov[1][1], cov[2][1],
-    //     cov[0][2], cov[1][2], cov[2][2]
-    // );
-
-	// Apply low-pass filter: every Gaussian should be at least
-	// one pixel wide/high. Discard 3rd row and column.
-	cov[0][0] += 0.3f;
-	cov[1][1] += 0.3f;
-	return { float(cov[0][0]), float(cov[0][1]), float(cov[1][1]) };
-}
-
-// Forward method for converting scale and rotation properties of each
-// Gaussian to a 3D covariance matrix in world space. Also takes care
-// of quaternion normalization.
-__host__ __device__ void computeCov3D(const glm::vec3 scale, float mod, const glm::vec4 rot, float* cov3D)
-{
-	// Create scaling matrix
-	glm::mat3 S = glm::mat3(1.0f);
-	S[0][0] = mod * scale.x;
-	S[1][1] = mod * scale.y;
-	S[2][2] = mod * scale.z;
-
-	// Normalize quaternion to get valid rotation
-	glm::vec4 q = rot;// / glm::length(rot);
-	float r = q.x;
-	float x = q.y;
-	float y = q.z;
-	float z = q.w;
-
-	// Compute rotation matrix from quaternion
-	glm::mat3 R = glm::mat3(
-		1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y),
-		2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x),
-		2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y)
-	);
-
-	glm::mat3 M = S * R;
-
-	// Compute 3D world covariance matrix Sigma
-	glm::mat3 Sigma = glm::transpose(M) * M;
-
-	// Covariance is symmetric, only store upper right
-	cov3D[0] = Sigma[0][0];
-	cov3D[1] = Sigma[0][1];
-	cov3D[2] = Sigma[0][2];
-	cov3D[3] = Sigma[1][1];
-	cov3D[4] = Sigma[1][2];
-	cov3D[5] = Sigma[2][2];
-}
-
-// Perform initial steps for each Gaussian prior to rasterization.
-template<int C>
-__global__ void preprocessCUDA(int P, int D, int M,
-	const float* orig_points,
-	const glm::vec3* scales,
-	const float scale_modifier,
-	const glm::vec4* rotations,
-	const float* opacities,
-	const float* shs,
-	bool* clamped,
-	const float* cov3D_precomp,
-	const float* colors_precomp,
-	const float* viewmatrix,
-	const float* projmatrix,
-	const glm::vec3* cam_pos,
-	const int W, int H,
-	const float tan_fovx, float tan_fovy,
-	const float focal_x, float focal_y,
-	int* radii,
-	float2* points_xy_image,
-	float* depths,
-	float* cov3Ds,
-	float* rgb,
-	float4* conic_opacity,
-	const dim3 grid,
-	uint32_t* tiles_touched,
-	bool prefiltered)
-{
-	auto idx = cg::this_grid().thread_rank();
-	if (idx >= P)
-		return;
-
-	// Initialize radius and touched tiles to 0. If this isn't changed,
-	// this Gaussian will not be processed further.
-	radii[idx] = 0;
-	tiles_touched[idx] = 0;
-
-	// Perform near culling, quit if outside.
-	float3 p_view;
-	if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view))
-		return;
-
-	// Transform point by projecting
-	float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] };
-	float4 p_hom = transformPoint4x4(p_orig, projmatrix);
-	float p_w = 1.0f / (p_hom.w + 0.0000001f);
-	float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
-
-	// If 3D covariance matrix is precomputed, use it, otherwise compute
-	// from scaling and rotation parameters. 
-	const float* cov3D;
-	if (cov3D_precomp != nullptr)
-	{
-		cov3D = cov3D_precomp + idx * 6;
-	}
-	else
-	{
-		computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6);
-		cov3D = cov3Ds + idx * 6;
-	}
-
-	// Compute 2D screen-space covariance matrix
-	float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx, tan_fovy, cov3D, viewmatrix);
-
-	// Invert covariance (EWA algorithm)
-	float det = (cov.x * cov.z - cov.y * cov.y);
-	if (det == 0.0f)
-		return;
-	float det_inv = 1.f / det;
-	float3 conic = { cov.z * det_inv, -cov.y * det_inv, cov.x * det_inv };
-
-	// Compute extent in screen space (by finding eigenvalues of
-	// 2D covariance matrix). Use extent to compute a bounding rectangle
-	// of screen-space tiles that this Gaussian overlaps with. Quit if
-	// rectangle covers 0 tiles. 
-	float mid = 0.5f * (cov.x + cov.z);
-	float lambda1 = mid + sqrt(max(0.1f, mid * mid - det));
-	float lambda2 = mid - sqrt(max(0.1f, mid * mid - det));
-	float my_radius = ceil(3.f * sqrt(max(lambda1, lambda2)));
-	float2 point_image = { ndc2Pix(p_proj.x, W), ndc2Pix(p_proj.y, H) };
-	uint2 rect_min, rect_max;
-	getRect(point_image, my_radius, rect_min, rect_max, grid);
-	if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0)
-		return;
-
-	// If colors have been precomputed, use them, otherwise convert
-	// spherical harmonics coefficients to RGB color.
-	if (colors_precomp == nullptr)
-	{
-		glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped);
-		rgb[idx * C + 0] = result.x;
-		rgb[idx * C + 1] = result.y;
-		rgb[idx * C + 2] = result.z;
-	}
-
-	// Store some useful helper data for the next steps.
-	depths[idx] = p_view.z;
-	radii[idx] = my_radius;
-	points_xy_image[idx] = point_image;
-	// Inverse 2D covariance and opacity neatly pack into one float4
-	conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[idx] };
-	tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x);
-}
-
-// Main rasterization method. Collaboratively works on one tile per
-// block, each thread treats one pixel. Alternates between fetching 
-// and rasterizing data.
-template <uint32_t CHANNELS>
-__global__ void __launch_bounds__(BLOCK_X * BLOCK_Y)
-renderCUDA(
-	const uint2* __restrict__ ranges,
-	const uint32_t* __restrict__ point_list,
-	int W, int H,
-	const float2* __restrict__ points_xy_image,
-	const float* __restrict__ features,
-	const float4* __restrict__ conic_opacity,
-	float* __restrict__ final_T,
-	uint32_t* __restrict__ n_contrib,
-	const float* __restrict__ bg_color,
-	float* __restrict__ out_color)
-{
-	// Identify current tile and associated min/max pixel range.
-	auto block = cg::this_thread_block();
-	uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
-	uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
-	uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
-	uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
-	uint32_t pix_id = W * pix.y + pix.x;
-	float2 pixf = { (float)pix.x, (float)pix.y };
-
-	// Check if this thread is associated with a valid pixel or outside.
-	bool inside = pix.x < W&& pix.y < H;
-	// Done threads can help with fetching, but don't rasterize
-	bool done = !inside;
-
-	// Load start/end range of IDs to process in bit sorted list.
-	uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
-	const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE);
-	int toDo = range.y - range.x;
-
-	// Allocate storage for batches of collectively fetched data.
-	__shared__ int collected_id[BLOCK_SIZE];
-	__shared__ float2 collected_xy[BLOCK_SIZE];
-	__shared__ float4 collected_conic_opacity[BLOCK_SIZE];
-
-	// Initialize helper variables
-	float T = 1.0f;
-	uint32_t contributor = 0;
-	uint32_t last_contributor = 0;
-	float C[CHANNELS] = { 0 };
-
-	// Iterate over batches until all done or range is complete
-	for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
-	{
-		// End if entire block votes that it is done rasterizing
-		int num_done = __syncthreads_count(done);
-		if (num_done == BLOCK_SIZE)
-			break;
-
-		// Collectively fetch per-Gaussian data from global to shared
-		int progress = i * BLOCK_SIZE + block.thread_rank();
-		if (range.x + progress < range.y)
-		{
-			int coll_id = point_list[range.x + progress];
-			collected_id[block.thread_rank()] = coll_id;
-			collected_xy[block.thread_rank()] = points_xy_image[coll_id];
-			collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id];
-		}
-		block.sync();
-
-		// Iterate over current batch
-		for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++)
-		{
-			// Keep track of current position in range
-			contributor++;
-
-			// Resample using conic matrix (cf. "Surface 
-			// Splatting" by Zwicker et al., 2001)
-			float2 xy = collected_xy[j];
-			float2 d = { xy.x - pixf.x, xy.y - pixf.y };
-			float4 con_o = collected_conic_opacity[j];
-			float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y;
-			if (power > 0.0f)
-				continue;
-
-			// Eq. (2) from 3D Gaussian splatting paper.
-			// Obtain alpha by multiplying with Gaussian opacity
-			// and its exponential falloff from mean.
-			// Avoid numerical instabilities (see paper appendix). 
-			float alpha = min(0.99f, con_o.w * exp(power));
-			if (alpha < 1.0f / 255.0f)
-				continue;
-			float test_T = T * (1 - alpha);
-			if (test_T < 0.0001f)
-			{
-				done = true;
-				continue;
-			}
-
-			// Eq. (3) from 3D Gaussian splatting paper.
-			for (int ch = 0; ch < CHANNELS; ch++)
-				C[ch] += features[collected_id[j] * CHANNELS + ch] * alpha * T;
-
-			T = test_T;
-
-			// Keep track of last range entry to update this
-			// pixel.
-			last_contributor = contributor;
-		}
-	}
-
-	// All threads that treat valid pixel write out their final
-	// rendering data to the frame and auxiliary buffers.
-	if (inside)
-	{
-		final_T[pix_id] = T;
-		n_contrib[pix_id] = last_contributor;
-		for (int ch = 0; ch < CHANNELS; ch++)
-			out_color[ch * H * W + pix_id] = C[ch] + T * bg_color[ch];
-	}
-}
-
-void FORWARD::render(
-	const dim3 grid, dim3 block,
-	const uint2* ranges,
-	const uint32_t* point_list,
-	int W, int H,
-	const float2* means2D,
-	const float* colors,
-	const float4* conic_opacity,
-	float* final_T,
-	uint32_t* n_contrib,
-	const float* bg_color,
-	float* out_color)
-{
-	renderCUDA<NUM_CHANNELS> << <grid, block >> > (
-		ranges,
-		point_list,
-		W, H,
-		means2D,
-		colors,
-		conic_opacity,
-		final_T,
-		n_contrib,
-		bg_color,
-		out_color);
-}
-
-void FORWARD::preprocess(int P, int D, int M,
-	const float* means3D,
-	const glm::vec3* scales,
-	const float scale_modifier,
-	const glm::vec4* rotations,
-	const float* opacities,
-	const float* shs,
-	bool* clamped,
-	const float* cov3D_precomp,
-	const float* colors_precomp,
-	const float* viewmatrix,
-	const float* projmatrix,
-	const glm::vec3* cam_pos,
-	const int W, int H,
-	const float focal_x, float focal_y,
-	const float tan_fovx, float tan_fovy,
-	int* radii,
-	float2* means2D,
-	float* depths,
-	float* cov3Ds,
-	float* rgb,
-	float4* conic_opacity,
-	const dim3 grid,
-	uint32_t* tiles_touched,
-	bool prefiltered)
-{
-	preprocessCUDA<NUM_CHANNELS> << <(P + 255) / 256, 256 >> > (
-		P, D, M,
-		means3D,
-		scales,
-		scale_modifier,
-		rotations,
-		opacities,
-		shs,
-		clamped,
-		cov3D_precomp,
-		colors_precomp,
-		viewmatrix, 
-		projmatrix,
-		cam_pos,
-		W, H,
-		tan_fovx, tan_fovy,
-		focal_x, focal_y,
-		radii,
-		means2D,
-		depths,
-		cov3Ds,
-		rgb,
-		conic_opacity,
-		grid,
-		tiles_touched,
-		prefiltered
-		);
-}
diff --git a/csrc/reference/cuda_rasterizer/forward.h b/csrc/reference/cuda_rasterizer/forward.h
deleted file mode 100644
index 3c11cb917..000000000
--- a/csrc/reference/cuda_rasterizer/forward.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (C) 2023, Inria
- * GRAPHDECO research group, https://team.inria.fr/graphdeco
- * All rights reserved.
- *
- * This software is free for non-commercial, research and evaluation use 
- * under the terms of the LICENSE.md file.
- *
- * For inquiries contact  george.drettakis@inria.fr
- */
-
-#ifndef CUDA_RASTERIZER_FORWARD_H_INCLUDED
-#define CUDA_RASTERIZER_FORWARD_H_INCLUDED
-
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-#define GLM_FORCE_CUDA
-#include <glm/glm.hpp>
-
-namespace FORWARD
-{
-	// Perform initial steps for each Gaussian prior to rasterization.
-	void preprocess(int P, int D, int M,
-		const float* orig_points,
-		const glm::vec3* scales,
-		const float scale_modifier,
-		const glm::vec4* rotations,
-		const float* opacities,
-		const float* shs,
-		bool* clamped,
-		const float* cov3D_precomp,
-		const float* colors_precomp,
-		const float* viewmatrix,
-		const float* projmatrix,
-		const glm::vec3* cam_pos,
-		const int W, int H,
-		const float focal_x, float focal_y,
-		const float tan_fovx, float tan_fovy,
-		int* radii,
-		float2* points_xy_image,
-		float* depths,
-		float* cov3Ds,
-		float* colors,
-		float4* conic_opacity,
-		const dim3 grid,
-		uint32_t* tiles_touched,
-		bool prefiltered);
-
-	// Main rasterization method.
-	void render(
-		const dim3 grid, dim3 block,
-		const uint2* ranges,
-		const uint32_t* point_list,
-		int W, int H,
-		const float2* points_xy_image,
-		const float* features,
-		const float4* conic_opacity,
-		float* final_T,
-		uint32_t* n_contrib,
-		const float* bg_color,
-		float* out_color);
-}
-
-
-#endif
\ No newline at end of file
diff --git a/csrc/reference/cuda_rasterizer/rasterizer.h b/csrc/reference/cuda_rasterizer/rasterizer.h
deleted file mode 100644
index 81544ef61..000000000
--- a/csrc/reference/cuda_rasterizer/rasterizer.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (C) 2023, Inria
- * GRAPHDECO research group, https://team.inria.fr/graphdeco
- * All rights reserved.
- *
- * This software is free for non-commercial, research and evaluation use 
- * under the terms of the LICENSE.md file.
- *
- * For inquiries contact  george.drettakis@inria.fr
- */
-
-#ifndef CUDA_RASTERIZER_H_INCLUDED
-#define CUDA_RASTERIZER_H_INCLUDED
-
-#include <vector>
-#include <functional>
-
-namespace CudaRasterizer
-{
-	class Rasterizer
-	{
-	public:
-
-		static void markVisible(
-			int P,
-			float* means3D,
-			float* viewmatrix,
-			float* projmatrix,
-			bool* present);
-
-		static int forward(
-			std::function<char* (size_t)> geometryBuffer,
-			std::function<char* (size_t)> binningBuffer,
-			std::function<char* (size_t)> imageBuffer,
-			const int P, int D, int M,
-			const float* background,
-			const int width, int height,
-			const float* means3D,
-			const float* shs,
-			const float* colors_precomp,
-			const float* opacities,
-			const float* scales,
-			const float scale_modifier,
-			const float* rotations,
-			const float* cov3D_precomp,
-			const float* viewmatrix,
-			const float* projmatrix,
-			const float* cam_pos,
-			const float tan_fovx, float tan_fovy,
-			const bool prefiltered,
-			float* out_color,
-			int* radii = nullptr,
-			bool debug = false);
-
-		static void backward(
-			const int P, int D, int M, int R,
-			const float* background,
-			const int width, int height,
-			const float* means3D,
-			const float* shs,
-			const float* colors_precomp,
-			const float* scales,
-			const float scale_modifier,
-			const float* rotations,
-			const float* cov3D_precomp,
-			const float* viewmatrix,
-			const float* projmatrix,
-			const float* campos,
-			const float tan_fovx, float tan_fovy,
-			const int* radii,
-			char* geom_buffer,
-			char* binning_buffer,
-			char* image_buffer,
-			const float* dL_dpix,
-			float* dL_dmean2D,
-			float* dL_dconic,
-			float* dL_dopacity,
-			float* dL_dcolor,
-			float* dL_dmean3D,
-			float* dL_dcov3D,
-			float* dL_dsh,
-			float* dL_dscale,
-			float* dL_drot,
-			bool debug);
-	};
-};
-
-#endif
\ No newline at end of file
diff --git a/csrc/reference/cuda_rasterizer/rasterizer_impl.cu b/csrc/reference/cuda_rasterizer/rasterizer_impl.cu
deleted file mode 100644
index f8782ac43..000000000
--- a/csrc/reference/cuda_rasterizer/rasterizer_impl.cu
+++ /dev/null
@@ -1,434 +0,0 @@
-/*
- * Copyright (C) 2023, Inria
- * GRAPHDECO research group, https://team.inria.fr/graphdeco
- * All rights reserved.
- *
- * This software is free for non-commercial, research and evaluation use 
- * under the terms of the LICENSE.md file.
- *
- * For inquiries contact  george.drettakis@inria.fr
- */
-
-#include "rasterizer_impl.h"
-#include <iostream>
-#include <fstream>
-#include <algorithm>
-#include <numeric>
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-#include <cub/cub.cuh>
-#include <cub/device/device_radix_sort.cuh>
-#define GLM_FORCE_CUDA
-#include <glm/glm.hpp>
-
-#include <cooperative_groups.h>
-#include <cooperative_groups/reduce.h>
-namespace cg = cooperative_groups;
-
-#include "auxiliary.h"
-#include "forward.h"
-#include "backward.h"
-
-// Helper function to find the next-highest bit of the MSB
-// on the CPU.
-uint32_t getHigherMsb(uint32_t n)
-{
-	uint32_t msb = sizeof(n) * 4;
-	uint32_t step = msb;
-	while (step > 1)
-	{
-		step /= 2;
-		if (n >> msb)
-			msb += step;
-		else
-			msb -= step;
-	}
-	if (n >> msb)
-		msb++;
-	return msb;
-}
-
-// Wrapper method to call auxiliary coarse frustum containment test.
-// Mark all Gaussians that pass it.
-__global__ void checkFrustum(int P,
-	const float* orig_points,
-	const float* viewmatrix,
-	const float* projmatrix,
-	bool* present)
-{
-	auto idx = cg::this_grid().thread_rank();
-	if (idx >= P)
-		return;
-
-	float3 p_view;
-	present[idx] = in_frustum(idx, orig_points, viewmatrix, projmatrix, false, p_view);
-}
-
-// Generates one key/value pair for all Gaussian / tile overlaps. 
-// Run once per Gaussian (1:N mapping).
-__global__ void duplicateWithKeys(
-	int P,
-	const float2* points_xy,
-	const float* depths,
-	const uint32_t* offsets,
-	uint64_t* gaussian_keys_unsorted,
-	uint32_t* gaussian_values_unsorted,
-	int* radii,
-	dim3 grid)
-{
-	auto idx = cg::this_grid().thread_rank();
-	if (idx >= P)
-		return;
-
-	// Generate no key/value pair for invisible Gaussians
-	if (radii[idx] > 0)
-	{
-		// Find this Gaussian's offset in buffer for writing keys/values.
-		uint32_t off = (idx == 0) ? 0 : offsets[idx - 1];
-		uint2 rect_min, rect_max;
-
-		getRect(points_xy[idx], radii[idx], rect_min, rect_max, grid);
-
-		// For each tile that the bounding rect overlaps, emit a 
-		// key/value pair. The key is |  tile ID  |      depth      |,
-		// and the value is the ID of the Gaussian. Sorting the values 
-		// with this key yields Gaussian IDs in a list, such that they
-		// are first sorted by tile and then by depth. 
-		for (int y = rect_min.y; y < rect_max.y; y++)
-		{
-			for (int x = rect_min.x; x < rect_max.x; x++)
-			{
-				uint64_t key = y * grid.x + x;
-				key <<= 32;
-				key |= *((uint32_t*)&depths[idx]);
-				gaussian_keys_unsorted[off] = key;
-				gaussian_values_unsorted[off] = idx;
-				off++;
-			}
-		}
-	}
-}
-
-// Check keys to see if it is at the start/end of one tile's range in 
-// the full sorted list. If yes, write start/end of this tile. 
-// Run once per instanced (duplicated) Gaussian ID.
-__global__ void identifyTileRanges(int L, uint64_t* point_list_keys, uint2* ranges)
-{
-	auto idx = cg::this_grid().thread_rank();
-	if (idx >= L)
-		return;
-
-	// Read tile ID from key. Update start/end of tile range if at limit.
-	uint64_t key = point_list_keys[idx];
-	uint32_t currtile = key >> 32;
-	if (idx == 0)
-		ranges[currtile].x = 0;
-	else
-	{
-		uint32_t prevtile = point_list_keys[idx - 1] >> 32;
-		if (currtile != prevtile)
-		{
-			ranges[prevtile].y = idx;
-			ranges[currtile].x = idx;
-		}
-	}
-	if (idx == L - 1)
-		ranges[currtile].y = L;
-}
-
-// Mark Gaussians as visible/invisible, based on view frustum testing
-void CudaRasterizer::Rasterizer::markVisible(
-	int P,
-	float* means3D,
-	float* viewmatrix,
-	float* projmatrix,
-	bool* present)
-{
-	checkFrustum << <(P + 255) / 256, 256 >> > (
-		P,
-		means3D,
-		viewmatrix, projmatrix,
-		present);
-}
-
-CudaRasterizer::GeometryState CudaRasterizer::GeometryState::fromChunk(char*& chunk, size_t P)
-{
-	GeometryState geom;
-	obtain(chunk, geom.depths, P, 128);
-	obtain(chunk, geom.clamped, P * 3, 128);
-	obtain(chunk, geom.internal_radii, P, 128);
-	obtain(chunk, geom.means2D, P, 128);
-	obtain(chunk, geom.cov3D, P * 6, 128);
-	obtain(chunk, geom.conic_opacity, P, 128);
-	obtain(chunk, geom.rgb, P * 3, 128);
-	obtain(chunk, geom.tiles_touched, P, 128);
-	cub::DeviceScan::InclusiveSum(nullptr, geom.scan_size, geom.tiles_touched, geom.tiles_touched, P);
-	obtain(chunk, geom.scanning_space, geom.scan_size, 128);
-	obtain(chunk, geom.point_offsets, P, 128);
-	return geom;
-}
-
-CudaRasterizer::ImageState CudaRasterizer::ImageState::fromChunk(char*& chunk, size_t N)
-{
-	ImageState img;
-	obtain(chunk, img.accum_alpha, N, 128);
-	obtain(chunk, img.n_contrib, N, 128);
-	obtain(chunk, img.ranges, N, 128);
-	return img;
-}
-
-CudaRasterizer::BinningState CudaRasterizer::BinningState::fromChunk(char*& chunk, size_t P)
-{
-	BinningState binning;
-	obtain(chunk, binning.point_list, P, 128);
-	obtain(chunk, binning.point_list_unsorted, P, 128);
-	obtain(chunk, binning.point_list_keys, P, 128);
-	obtain(chunk, binning.point_list_keys_unsorted, P, 128);
-	cub::DeviceRadixSort::SortPairs(
-		nullptr, binning.sorting_size,
-		binning.point_list_keys_unsorted, binning.point_list_keys,
-		binning.point_list_unsorted, binning.point_list, P);
-	obtain(chunk, binning.list_sorting_space, binning.sorting_size, 128);
-	return binning;
-}
-
-// Forward rendering procedure for differentiable rasterization
-// of Gaussians.
-int CudaRasterizer::Rasterizer::forward(
-	std::function<char* (size_t)> geometryBuffer,
-	std::function<char* (size_t)> binningBuffer,
-	std::function<char* (size_t)> imageBuffer,
-	const int P, int D, int M,
-	const float* background,
-	const int width, int height,
-	const float* means3D,
-	const float* shs,
-	const float* colors_precomp,
-	const float* opacities,
-	const float* scales,
-	const float scale_modifier,
-	const float* rotations,
-	const float* cov3D_precomp,
-	const float* viewmatrix,
-	const float* projmatrix,
-	const float* cam_pos,
-	const float tan_fovx, float tan_fovy,
-	const bool prefiltered,
-	float* out_color,
-	int* radii,
-	bool debug)
-{
-	const float focal_y = height / (2.0f * tan_fovy);
-	const float focal_x = width / (2.0f * tan_fovx);
-
-	size_t chunk_size = required<GeometryState>(P);
-	char* chunkptr = geometryBuffer(chunk_size);
-	GeometryState geomState = GeometryState::fromChunk(chunkptr, P);
-
-	if (radii == nullptr)
-	{
-		radii = geomState.internal_radii;
-	}
-
-	dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
-	dim3 block(BLOCK_X, BLOCK_Y, 1);
-
-	// Dynamically resize image-based auxiliary buffers during training
-	size_t img_chunk_size = required<ImageState>(width * height);
-	char* img_chunkptr = imageBuffer(img_chunk_size);
-	ImageState imgState = ImageState::fromChunk(img_chunkptr, width * height);
-
-	if (NUM_CHANNELS != 3 && colors_precomp == nullptr)
-	{
-		throw std::runtime_error("For non-RGB, provide precomputed Gaussian colors!");
-	}
-
-	// Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB)
-	CHECK_CUDA(FORWARD::preprocess(
-		P, D, M,
-		means3D,
-		(glm::vec3*)scales,
-		scale_modifier,
-		(glm::vec4*)rotations,
-		opacities,
-		shs,
-		geomState.clamped,
-		cov3D_precomp,
-		colors_precomp,
-		viewmatrix, projmatrix,
-		(glm::vec3*)cam_pos,
-		width, height,
-		focal_x, focal_y,
-		tan_fovx, tan_fovy,
-		radii,
-		geomState.means2D,
-		geomState.depths,
-		geomState.cov3D,
-		geomState.rgb,
-		geomState.conic_opacity,
-		tile_grid,
-		geomState.tiles_touched,
-		prefiltered
-	), debug)
-
-	// Compute prefix sum over full list of touched tile counts by Gaussians
-	// E.g., [2, 3, 0, 2, 1] -> [2, 5, 5, 7, 8]
-	CHECK_CUDA(cub::DeviceScan::InclusiveSum(geomState.scanning_space, geomState.scan_size, geomState.tiles_touched, geomState.point_offsets, P), debug)
-
-	// Retrieve total number of Gaussian instances to launch and resize aux buffers
-	int num_rendered;
-	CHECK_CUDA(cudaMemcpy(&num_rendered, geomState.point_offsets + P - 1, sizeof(int), cudaMemcpyDeviceToHost), debug);
-
-	size_t binning_chunk_size = required<BinningState>(num_rendered);
-	char* binning_chunkptr = binningBuffer(binning_chunk_size);
-	BinningState binningState = BinningState::fromChunk(binning_chunkptr, num_rendered);
-
-	// For each instance to be rendered, produce adequate [ tile | depth ] key 
-	// and corresponding dublicated Gaussian indices to be sorted
-	duplicateWithKeys << <(P + 255) / 256, 256 >> > (
-		P,
-		geomState.means2D,
-		geomState.depths,
-		geomState.point_offsets,
-		binningState.point_list_keys_unsorted,
-		binningState.point_list_unsorted,
-		radii,
-		tile_grid)
-	CHECK_CUDA(, debug)
-
-	int bit = getHigherMsb(tile_grid.x * tile_grid.y);
-
-	// Sort complete list of (duplicated) Gaussian indices by keys
-	CHECK_CUDA(cub::DeviceRadixSort::SortPairs(
-		binningState.list_sorting_space,
-		binningState.sorting_size,
-		binningState.point_list_keys_unsorted, binningState.point_list_keys,
-		binningState.point_list_unsorted, binningState.point_list,
-		num_rendered, 0, 32 + bit), debug)
-
-	CHECK_CUDA(cudaMemset(imgState.ranges, 0, tile_grid.x * tile_grid.y * sizeof(uint2)), debug);
-
-	// Identify start and end of per-tile workloads in sorted list
-	if (num_rendered > 0)
-		identifyTileRanges << <(num_rendered + 255) / 256, 256 >> > (
-			num_rendered,
-			binningState.point_list_keys,
-			imgState.ranges);
-	CHECK_CUDA(, debug)
-
-	// Let each tile blend its range of Gaussians independently in parallel
-	const float* feature_ptr = colors_precomp != nullptr ? colors_precomp : geomState.rgb;
-	CHECK_CUDA(FORWARD::render(
-		tile_grid, block,
-		imgState.ranges,
-		binningState.point_list,
-		width, height,
-		geomState.means2D,
-		feature_ptr,
-		geomState.conic_opacity,
-		imgState.accum_alpha,
-		imgState.n_contrib,
-		background,
-		out_color), debug)
-
-	return num_rendered;
-}
-
-// Produce necessary gradients for optimization, corresponding
-// to forward render pass
-void CudaRasterizer::Rasterizer::backward(
-	const int P, int D, int M, int R,
-	const float* background,
-	const int width, int height,
-	const float* means3D,
-	const float* shs,
-	const float* colors_precomp,
-	const float* scales,
-	const float scale_modifier,
-	const float* rotations,
-	const float* cov3D_precomp,
-	const float* viewmatrix,
-	const float* projmatrix,
-	const float* campos,
-	const float tan_fovx, float tan_fovy,
-	const int* radii,
-	char* geom_buffer,
-	char* binning_buffer,
-	char* img_buffer,
-	const float* dL_dpix,
-	float* dL_dmean2D,
-	float* dL_dconic,
-	float* dL_dopacity,
-	float* dL_dcolor,
-	float* dL_dmean3D,
-	float* dL_dcov3D,
-	float* dL_dsh,
-	float* dL_dscale,
-	float* dL_drot,
-	bool debug)
-{
-	GeometryState geomState = GeometryState::fromChunk(geom_buffer, P);
-	BinningState binningState = BinningState::fromChunk(binning_buffer, R);
-	ImageState imgState = ImageState::fromChunk(img_buffer, width * height);
-
-	if (radii == nullptr)
-	{
-		radii = geomState.internal_radii;
-	}
-
-	const float focal_y = height / (2.0f * tan_fovy);
-	const float focal_x = width / (2.0f * tan_fovx);
-
-	const dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
-	const dim3 block(BLOCK_X, BLOCK_Y, 1);
-
-	// Compute loss gradients w.r.t. 2D mean position, conic matrix,
-	// opacity and RGB of Gaussians from per-pixel loss gradients.
-	// If we were given precomputed colors and not SHs, use them.
-	const float* color_ptr = (colors_precomp != nullptr) ? colors_precomp : geomState.rgb;
-	CHECK_CUDA(BACKWARD::render(
-		tile_grid,
-		block,
-		imgState.ranges,
-		binningState.point_list,
-		width, height,
-		background,
-		geomState.means2D,
-		geomState.conic_opacity,
-		color_ptr,
-		imgState.accum_alpha,
-		imgState.n_contrib,
-		dL_dpix,
-		(float3*)dL_dmean2D,
-		(float4*)dL_dconic,
-		dL_dopacity,
-		dL_dcolor), debug)
-
-	// Take care of the rest of preprocessing. Was the precomputed covariance
-	// given to us or a scales/rot pair? If precomputed, pass that. If not,
-	// use the one we computed ourselves.
-	const float* cov3D_ptr = (cov3D_precomp != nullptr) ? cov3D_precomp : geomState.cov3D;
-	CHECK_CUDA(BACKWARD::preprocess(P, D, M,
-		(float3*)means3D,
-		radii,
-		shs,
-		geomState.clamped,
-		(glm::vec3*)scales,
-		(glm::vec4*)rotations,
-		scale_modifier,
-		cov3D_ptr,
-		viewmatrix,
-		projmatrix,
-		focal_x, focal_y,
-		tan_fovx, tan_fovy,
-		(glm::vec3*)campos,
-		(float3*)dL_dmean2D,
-		dL_dconic,
-		(glm::vec3*)dL_dmean3D,
-		dL_dcolor,
-		dL_dcov3D,
-		dL_dsh,
-		(glm::vec3*)dL_dscale,
-		(glm::vec4*)dL_drot), debug)
-}
\ No newline at end of file
diff --git a/csrc/reference/cuda_rasterizer/rasterizer_impl.h b/csrc/reference/cuda_rasterizer/rasterizer_impl.h
deleted file mode 100644
index bc3f0ece7..000000000
--- a/csrc/reference/cuda_rasterizer/rasterizer_impl.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (C) 2023, Inria
- * GRAPHDECO research group, https://team.inria.fr/graphdeco
- * All rights reserved.
- *
- * This software is free for non-commercial, research and evaluation use 
- * under the terms of the LICENSE.md file.
- *
- * For inquiries contact  george.drettakis@inria.fr
- */
-
-#pragma once
-
-#include <iostream>
-#include <vector>
-#include "rasterizer.h"
-#include <cuda_runtime_api.h>
-
-namespace CudaRasterizer
-{
-	template <typename T>
-	static void obtain(char*& chunk, T*& ptr, std::size_t count, std::size_t alignment)
-	{
-		std::size_t offset = (reinterpret_cast<std::uintptr_t>(chunk) + alignment - 1) & ~(alignment - 1);
-		ptr = reinterpret_cast<T*>(offset);
-		chunk = reinterpret_cast<char*>(ptr + count);
-	}
-
-	struct GeometryState
-	{
-		size_t scan_size;
-		float* depths;
-		char* scanning_space;
-		bool* clamped;
-		int* internal_radii;
-		float2* means2D;
-		float* cov3D;
-		float4* conic_opacity;
-		float* rgb;
-		uint32_t* point_offsets;
-		uint32_t* tiles_touched;
-
-		static GeometryState fromChunk(char*& chunk, size_t P);
-	};
-
-	struct ImageState
-	{
-		uint2* ranges;
-		uint32_t* n_contrib;
-		float* accum_alpha;
-
-		static ImageState fromChunk(char*& chunk, size_t N);
-	};
-
-	struct BinningState
-	{
-		size_t sorting_size;
-		uint64_t* point_list_keys_unsorted;
-		uint64_t* point_list_keys;
-		uint32_t* point_list_unsorted;
-		uint32_t* point_list;
-		char* list_sorting_space;
-
-		static BinningState fromChunk(char*& chunk, size_t P);
-	};
-
-	template<typename T> 
-	size_t required(size_t P)
-	{
-		char* size = nullptr;
-		T::fromChunk(size, P);
-		return ((size_t)size) + 128;
-	}
-};
\ No newline at end of file
diff --git a/csrc/reference/ext.cpp b/csrc/reference/ext.cpp
deleted file mode 100644
index 2c4a771d6..000000000
--- a/csrc/reference/ext.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (C) 2023, Inria
- * GRAPHDECO research group, https://team.inria.fr/graphdeco
- * All rights reserved.
- *
- * This software is free for non-commercial, research and evaluation use 
- * under the terms of the LICENSE.md file.
- *
- * For inquiries contact  george.drettakis@inria.fr
- */
-
-#include <torch/extension.h>
-#include "rasterize_points.h"
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("rasterize_gaussians", &RasterizeGaussiansCUDA);
-  m.def("rasterize_gaussians_backward", &RasterizeGaussiansBackwardCUDA);
-  m.def("mark_visible", &markVisible);
-}
diff --git a/csrc/reference/rasterize_points.cu b/csrc/reference/rasterize_points.cu
deleted file mode 100644
index fceef53c3..000000000
--- a/csrc/reference/rasterize_points.cu
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * Copyright (C) 2023, Inria
- * GRAPHDECO research group, https://team.inria.fr/graphdeco
- * All rights reserved.
- *
- * This software is free for non-commercial, research and evaluation use 
- * under the terms of the LICENSE.md file.
- *
- * For inquiries contact  george.drettakis@inria.fr
- */
-
-#include <math.h>
-#include <torch/extension.h>
-#include <cstdio>
-#include <sstream>
-#include <iostream>
-#include <tuple>
-#include <stdio.h>
-#include <cuda_runtime_api.h>
-#include <memory>
-#include <fstream>
-#include <string>
-#include <functional>
-
-#include "cuda_rasterizer/config.h"
-#include "cuda_rasterizer/rasterizer.h"
-
-
-std::function<char*(size_t N)> resizeFunctional(torch::Tensor& t) {
-    auto lambda = [&t](size_t N) {
-        t.resize_({(long long)N});
-		return reinterpret_cast<char*>(t.contiguous().data_ptr());
-    };
-    return lambda;
-}
-
-std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
-RasterizeGaussiansCUDA(
-	const torch::Tensor& background,
-	const torch::Tensor& means3D,
-    const torch::Tensor& colors,
-    const torch::Tensor& opacity,
-	const torch::Tensor& scales,
-	const torch::Tensor& rotations,
-	const float scale_modifier,
-	const torch::Tensor& cov3D_precomp,
-	const torch::Tensor& viewmatrix,
-	const torch::Tensor& projmatrix,
-	const float tan_fovx, 
-	const float tan_fovy,
-    const int image_height,
-    const int image_width,
-	const torch::Tensor& sh,
-	const int degree,
-	const torch::Tensor& campos,
-	const bool prefiltered,
-	const bool debug)
-{
-  if (means3D.ndimension() != 2 || means3D.size(1) != 3) {
-    AT_ERROR("means3D must have dimensions (num_points, 3)");
-  }
-  
-  const int P = means3D.size(0);
-  const int H = image_height;
-  const int W = image_width;
-
-  auto int_opts = means3D.options().dtype(torch::kInt32);
-  auto float_opts = means3D.options().dtype(torch::kFloat32);
-
-  torch::Tensor out_color = torch::full({NUM_CHANNELS, H, W}, 0.0, float_opts);
-  torch::Tensor radii = torch::full({P}, 0, means3D.options().dtype(torch::kInt32));
-  
-  torch::Device device(torch::kCUDA);
-  torch::TensorOptions options(torch::kByte);
-  torch::Tensor geomBuffer = torch::empty({0}, options.device(device));
-  torch::Tensor binningBuffer = torch::empty({0}, options.device(device));
-  torch::Tensor imgBuffer = torch::empty({0}, options.device(device));
-  std::function<char*(size_t)> geomFunc = resizeFunctional(geomBuffer);
-  std::function<char*(size_t)> binningFunc = resizeFunctional(binningBuffer);
-  std::function<char*(size_t)> imgFunc = resizeFunctional(imgBuffer);
-  
-  int rendered = 0;
-  if(P != 0)
-  {
-	  int M = 0;
-	  if(sh.size(0) != 0)
-	  {
-		M = sh.size(1);
-      }
-
-	  rendered = CudaRasterizer::Rasterizer::forward(
-	    geomFunc,
-		binningFunc,
-		imgFunc,
-	    P, degree, M,
-		background.contiguous().data<float>(),
-		W, H,
-		means3D.contiguous().data<float>(),
-		sh.contiguous().data_ptr<float>(),
-		colors.contiguous().data<float>(), 
-		opacity.contiguous().data<float>(), 
-		scales.contiguous().data_ptr<float>(),
-		scale_modifier,
-		rotations.contiguous().data_ptr<float>(),
-		cov3D_precomp.contiguous().data<float>(), 
-		viewmatrix.contiguous().data<float>(), 
-		projmatrix.contiguous().data<float>(),
-		campos.contiguous().data<float>(),
-		tan_fovx,
-		tan_fovy,
-		prefiltered,
-		out_color.contiguous().data<float>(),
-		radii.contiguous().data<int>(),
-		debug);
-  }
-  return std::make_tuple(rendered, out_color, radii, geomBuffer, binningBuffer, imgBuffer);
-}
-
-std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
- RasterizeGaussiansBackwardCUDA(
- 	const torch::Tensor& background,
-	const torch::Tensor& means3D,
-	const torch::Tensor& radii,
-    const torch::Tensor& colors,
-	const torch::Tensor& scales,
-	const torch::Tensor& rotations,
-	const float scale_modifier,
-	const torch::Tensor& cov3D_precomp,
-	const torch::Tensor& viewmatrix,
-    const torch::Tensor& projmatrix,
-	const float tan_fovx,
-	const float tan_fovy,
-    const torch::Tensor& dL_dout_color,
-	const torch::Tensor& sh,
-	const int degree,
-	const torch::Tensor& campos,
-	const torch::Tensor& geomBuffer,
-	const int R,
-	const torch::Tensor& binningBuffer,
-	const torch::Tensor& imageBuffer,
-	const bool debug) 
-{
-  const int P = means3D.size(0);
-  const int H = dL_dout_color.size(1);
-  const int W = dL_dout_color.size(2);
-  
-  int M = 0;
-  if(sh.size(0) != 0)
-  {	
-	M = sh.size(1);
-  }
-
-  torch::Tensor dL_dmeans3D = torch::zeros({P, 3}, means3D.options());
-  torch::Tensor dL_dmeans2D = torch::zeros({P, 3}, means3D.options());
-  torch::Tensor dL_dcolors = torch::zeros({P, NUM_CHANNELS}, means3D.options());
-  torch::Tensor dL_dconic = torch::zeros({P, 2, 2}, means3D.options());
-  torch::Tensor dL_dopacity = torch::zeros({P, 1}, means3D.options());
-  torch::Tensor dL_dcov3D = torch::zeros({P, 6}, means3D.options());
-  torch::Tensor dL_dsh = torch::zeros({P, M, 3}, means3D.options());
-  torch::Tensor dL_dscales = torch::zeros({P, 3}, means3D.options());
-  torch::Tensor dL_drotations = torch::zeros({P, 4}, means3D.options());
-  
-  if(P != 0)
-  {  
-	  CudaRasterizer::Rasterizer::backward(P, degree, M, R,
-	  background.contiguous().data<float>(),
-	  W, H, 
-	  means3D.contiguous().data<float>(),
-	  sh.contiguous().data<float>(),
-	  colors.contiguous().data<float>(),
-	  scales.data_ptr<float>(),
-	  scale_modifier,
-	  rotations.data_ptr<float>(),
-	  cov3D_precomp.contiguous().data<float>(),
-	  viewmatrix.contiguous().data<float>(),
-	  projmatrix.contiguous().data<float>(),
-	  campos.contiguous().data<float>(),
-	  tan_fovx,
-	  tan_fovy,
-	  radii.contiguous().data<int>(),
-	  reinterpret_cast<char*>(geomBuffer.contiguous().data_ptr()),
-	  reinterpret_cast<char*>(binningBuffer.contiguous().data_ptr()),
-	  reinterpret_cast<char*>(imageBuffer.contiguous().data_ptr()),
-	  dL_dout_color.contiguous().data<float>(),
-	  dL_dmeans2D.contiguous().data<float>(),
-	  dL_dconic.contiguous().data<float>(),  
-	  dL_dopacity.contiguous().data<float>(),
-	  dL_dcolors.contiguous().data<float>(),
-	  dL_dmeans3D.contiguous().data<float>(),
-	  dL_dcov3D.contiguous().data<float>(),
-	  dL_dsh.contiguous().data<float>(),
-	  dL_dscales.contiguous().data<float>(),
-	  dL_drotations.contiguous().data<float>(),
-	  debug);
-  }
-
-  return std::make_tuple(dL_dmeans2D, dL_dcolors, dL_dopacity, dL_dmeans3D, dL_dcov3D, dL_dsh, dL_dscales, dL_drotations);
-}
-
-torch::Tensor markVisible(
-		torch::Tensor& means3D,
-		torch::Tensor& viewmatrix,
-		torch::Tensor& projmatrix)
-{ 
-  const int P = means3D.size(0);
-  
-  torch::Tensor present = torch::full({P}, false, means3D.options().dtype(at::kBool));
- 
-  if(P != 0)
-  {
-	CudaRasterizer::Rasterizer::markVisible(P,
-		means3D.contiguous().data<float>(),
-		viewmatrix.contiguous().data<float>(),
-		projmatrix.contiguous().data<float>(),
-		present.contiguous().data<bool>());
-  }
-  
-  return present;
-}
diff --git a/csrc/reference/rasterize_points.h b/csrc/reference/rasterize_points.h
deleted file mode 100644
index 9023d994b..000000000
--- a/csrc/reference/rasterize_points.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (C) 2023, Inria
- * GRAPHDECO research group, https://team.inria.fr/graphdeco
- * All rights reserved.
- *
- * This software is free for non-commercial, research and evaluation use 
- * under the terms of the LICENSE.md file.
- *
- * For inquiries contact  george.drettakis@inria.fr
- */
-
-#pragma once
-#include <torch/extension.h>
-#include <cstdio>
-#include <tuple>
-#include <string>
-	
-std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
-RasterizeGaussiansCUDA(
-	const torch::Tensor& background,
-	const torch::Tensor& means3D,
-    const torch::Tensor& colors,
-    const torch::Tensor& opacity,
-	const torch::Tensor& scales,
-	const torch::Tensor& rotations,
-	const float scale_modifier,
-	const torch::Tensor& cov3D_precomp,
-	const torch::Tensor& viewmatrix,
-	const torch::Tensor& projmatrix,
-	const float tan_fovx, 
-	const float tan_fovy,
-    const int image_height,
-    const int image_width,
-	const torch::Tensor& sh,
-	const int degree,
-	const torch::Tensor& campos,
-	const bool prefiltered,
-	const bool debug);
-
-std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
- RasterizeGaussiansBackwardCUDA(
- 	const torch::Tensor& background,
-	const torch::Tensor& means3D,
-	const torch::Tensor& radii,
-    const torch::Tensor& colors,
-	const torch::Tensor& scales,
-	const torch::Tensor& rotations,
-	const float scale_modifier,
-	const torch::Tensor& cov3D_precomp,
-	const torch::Tensor& viewmatrix,
-    const torch::Tensor& projmatrix,
-	const float tan_fovx, 
-	const float tan_fovy,
-    const torch::Tensor& dL_dout_color,
-	const torch::Tensor& sh,
-	const int degree,
-	const torch::Tensor& campos,
-	const torch::Tensor& geomBuffer,
-	const int R,
-	const torch::Tensor& binningBuffer,
-	const torch::Tensor& imageBuffer,
-	const bool debug);
-		
-torch::Tensor markVisible(
-		torch::Tensor& means3D,
-		torch::Tensor& viewmatrix,
-		torch::Tensor& projmatrix);
\ No newline at end of file
diff --git a/csrc/test_forward.cu b/csrc/test_forward.cu
deleted file mode 100644
index 9fda9d11f..000000000
--- a/csrc/test_forward.cu
+++ /dev/null
@@ -1,404 +0,0 @@
-#include <cstdlib>
-#include <cstring>
-#include <ctime>
-#include <iostream>
-#include <sstream>
-#include <math.h>
-
-#include "config.h"
-#include "forward.cuh"
-#include "tgaimage.h"
-
-float random_float() { return (float)std::rand() / RAND_MAX; }
-
-
-float4 random_quat() {
-    float u = random_float();
-    float v = random_float();
-    float w = random_float();
-    return {
-        sqrt(1.f - u) * sin(2.f * (float) M_PI * v),
-        sqrt(1.f - u) * cos(2.f * (float) M_PI * v),
-        sqrt(u) * sin(2.f * (float) M_PI * w),
-        sqrt(u) * cos(2.f * (float) M_PI * w)
-    };
-}
-
-int main(int argc, char *argv[]) {
-    int num_points = 256;
-    int n = 16;
-    if (argc > 1) {
-        num_points = std::stoi(argv[1]);
-    }
-    printf("rendering %d points\n", num_points);
-
-    float bd = 2.f;
-    if (argc > 2) {
-        bd = std::stof(argv[2]);
-    }
-    printf("using scene bounds %.2f\n", bd);
-
-    int mode = 0;
-    if (argc > 3) {  // debug
-        mode = std::stoi(argv[3]);
-    }
-    printf("%d mode\n", mode);
-    const float fov_x = M_PI / 2.f;
-    const int W = 512;
-    const int H = 512;
-    const float focal = 0.5 * (float)W / tan(0.5 * fov_x);
-    const dim3 tile_bounds = {
-        (W + BLOCK_X - 1) / BLOCK_X, (H + BLOCK_Y - 1) / BLOCK_Y, 1};
-    const dim3 img_size = {W, H, 1};
-    const dim3 block = {BLOCK_X, BLOCK_Y, 1};
-
-    int num_cov3d = num_points * 6;
-    int num_view = 16;
-    const int C = 3;
-
-    float3 *means = new float3[num_points];
-    float3 *scales = new float3[num_points];
-    float4 *quats = new float4[num_points];
-    float *rgbs = new float[C * num_points];
-    float *opacities = new float[num_points];
-    // world to camera matrix
-    float viewmat[] = {
-        1.f,
-        0.f,
-        0.f,
-        0.f,
-        0.f,
-        1.f,
-        0.f,
-        0.f,
-        0.f,
-        0.f,
-        1.f,
-        10.f,
-        0.f,
-        0.f,
-        0.f,
-        1.f};
-
-    // random initialization of gaussians
-    std::srand(std::time(nullptr));
-    if (mode == 0) {
-        for (int i = 0; i < num_points; ++i) {
-            float v = (float)i - (float)num_points * 0.5f;
-            means[i] = {v * 0.1f, v * 0.1f, (float)i * 0.1f};
-            printf("%d, %.2f %.2f %.2f\n", i, means[i].x, means[i].y, means[i].z);
-            scales[i] = {1.f, 0.5f, 1.f};
-            for (int c = 0; c < C; ++c) {
-                rgbs[C * i + c] = (float)(i % C == c);
-            }
-            quats[i] = {1.f, 0.f, 0.f, 0.f};
-            opacities[i] = 0.9f;
-        }
-    } else if (mode == 1) {
-        for (int i = 0; i < num_points; ++i) {
-            means[i] = {
-                bd * (random_float() - 0.5f),
-                bd * (random_float() - 0.5f),
-                bd * (random_float() - 0.5f)
-            };
-            printf("%d, %.2f %.2f %.2f\n", i, means[i].x, means[i].y, means[i].z);
-            scales[i] = {random_float(), random_float(), random_float()};
-            for (int c = 0; c < C; ++c) {
-                rgbs[C * i + c] = random_float();
-            }
-            quats[i] = {1.f, 0.f, 0.f, 0.f};
-            opacities[i] = 0.9f;
-        }
-    } else {
-        for (int i = 0; i < n; ++i) {
-            for (int j = 0; j < n; ++j) {
-                int idx = n * i + j;
-                float x = 2.f * ((float) j / (float) n - 0.5f);
-                float y = 2.f * ((float) i / (float) n - 0.5f);
-                // float z = 2.f * (float) idx / (float) num_points;
-                float z = 5.f + random_float();
-                // float z = (float) idx;
-                means[idx] = {x, y, z};
-                printf("%d, %.2f %.2f %.2f\n", idx, x, y, z);
-                scales[idx] = {0.5f, 0.5f, 0.5f};
-                for (int c = 0; c < C; ++c) {
-                    // rgbs[C * idx + c] = (float)(i % C == c);
-                    rgbs[C * idx + c] = random_float();
-                }
-                quats[idx] = {0.f, 0.f, 0.f, 1.f};  // w x y z convention
-                opacities[idx] = 0.9f;
-            }
-        }
-    }
-
-    float3 *scales_d, *means_d;
-    float *rgbs_d;
-    float4 *quats_d;
-    float *viewmat_d, *opacities_d;
-
-    cudaMalloc((void **)&scales_d, num_points * sizeof(float3));
-    cudaMalloc((void **)&means_d, num_points * sizeof(float3));
-    cudaMalloc((void **)&quats_d, num_points * sizeof(float4));
-    cudaMalloc((void **)&rgbs_d, C * num_points * sizeof(float));
-    cudaMalloc((void **)&opacities_d, num_points * sizeof(float));
-    cudaMalloc((void **)&viewmat_d, num_view * sizeof(float));
-
-    cudaMemcpy(
-        scales_d, scales, num_points * sizeof(float3), cudaMemcpyHostToDevice
-    );
-    cudaMemcpy(
-        means_d, means, num_points * sizeof(float3), cudaMemcpyHostToDevice
-    );
-    cudaMemcpy(
-        rgbs_d, rgbs, C * num_points * sizeof(float), cudaMemcpyHostToDevice
-    );
-    cudaMemcpy(
-        opacities_d,
-        opacities,
-        num_points * sizeof(float),
-        cudaMemcpyHostToDevice
-    );
-    cudaMemcpy(
-        quats_d, quats, num_points * sizeof(float4), cudaMemcpyHostToDevice
-    );
-    cudaMemcpy(
-        viewmat_d, viewmat, num_view * sizeof(float), cudaMemcpyHostToDevice
-    );
-
-    // allocate memory for outputs
-    float *covs3d = new float[num_cov3d];
-    float2 *xy = new float2[num_points];
-    float *z = new float[num_points];
-    int *radii = new int[num_points];
-    float3 *conics = new float3[num_points];
-    int32_t *num_tiles_hit = new int32_t[num_points];
-
-    float *covs3d_d, *z_d;
-    float2 *xy_d;
-    float3 *conics_d;
-    int *radii_d;
-    int32_t *num_tiles_hit_d;
-    cudaMalloc((void **)&covs3d_d, num_cov3d * sizeof(float));
-    cudaMalloc((void **)&xy_d, num_points * sizeof(float2));
-    cudaMalloc((void **)&z_d, num_points * sizeof(float));
-    cudaMalloc((void **)&radii_d, num_points * sizeof(int));
-    cudaMalloc((void **)&conics_d, num_points * sizeof(float3));
-    cudaMalloc((void **)&num_tiles_hit_d, num_points * sizeof(int32_t));
-
-    printf("projecting into 2d...\n");
-    project_gaussians_forward_impl(
-        num_points,
-        means_d,
-        scales_d,
-        1.f,
-        quats_d,
-        viewmat_d,
-        viewmat_d,
-        focal,
-        focal,
-        img_size,
-        tile_bounds,
-        covs3d_d,
-        xy_d,
-        z_d,
-        radii_d,
-        conics_d,
-        num_tiles_hit_d
-    );
-    cudaMemcpy(
-        covs3d, covs3d_d, num_cov3d * sizeof(float), cudaMemcpyDeviceToHost
-    );
-    cudaMemcpy(xy, xy_d, num_points * sizeof(float2), cudaMemcpyDeviceToHost);
-    cudaMemcpy(z, z_d, num_points * sizeof(float), cudaMemcpyDeviceToHost);
-    cudaMemcpy(
-        radii, radii_d, num_points * sizeof(int), cudaMemcpyDeviceToHost
-    );
-    cudaMemcpy(
-        num_tiles_hit,
-        num_tiles_hit_d,
-        num_points * sizeof(int32_t),
-        cudaMemcpyDeviceToHost
-    );
-
-    printf("binning and sorting...\n");
-    int32_t num_intersects;
-    int32_t *cum_tiles_hit = new int32_t[num_points];
-    int32_t *cum_tiles_hit_d;
-    cudaMalloc((void **)&cum_tiles_hit_d, num_points * sizeof(int32_t));
-    compute_cumulative_intersects(
-        num_points, num_tiles_hit_d, num_intersects, cum_tiles_hit_d
-    );
-    printf("num_intersects %d\n", num_intersects);
-    // cudaMemcpy(cum_tiles_hit, cum_tiles_hit_d, num_points * sizeof(int32_t),
-    // cudaMemcpyDeviceToHost); for (int i = 0; i < num_points; ++i) {
-    //     printf("num_tiles_hit %d, %d\n", i, num_tiles_hit[i]);
-    // }
-
-    int64_t *isect_ids_sorted_d;
-    int32_t *gaussian_ids_sorted_d; // sorted by tile and depth
-    int64_t *isect_ids_sorted = new int64_t[num_intersects];
-    int32_t *gaussian_ids_sorted = new int32_t[num_intersects];
-    cudaMalloc((void **)&isect_ids_sorted_d, num_intersects * sizeof(int64_t));
-    cudaMalloc(
-        (void **)&gaussian_ids_sorted_d, num_intersects * sizeof(int32_t)
-    );
-
-    int64_t *isect_ids_unsorted_d;
-    int32_t *gaussian_ids_unsorted_d; // sorted by tile and depth
-    int64_t *isect_ids_unsorted = new int64_t[num_intersects];
-    int32_t *gaussian_ids_unsorted = new int32_t[num_intersects];
-    cudaMalloc(
-        (void **)&isect_ids_unsorted_d, num_intersects * sizeof(int64_t)
-    );
-    cudaMalloc(
-        (void **)&gaussian_ids_unsorted_d, num_intersects * sizeof(int32_t)
-    );
-
-    int num_tiles = tile_bounds.x * tile_bounds.y;
-    int2 *tile_bins_d; // start and end indices for each tile
-    int2 *tile_bins = new int2[num_tiles];
-    cudaMalloc((void **)&tile_bins_d, num_tiles * sizeof(int2));
-
-    bin_and_sort_gaussians(
-        num_points,
-        num_intersects,
-        xy_d,
-        z_d,
-        radii_d,
-        cum_tiles_hit_d,
-        tile_bounds,
-        isect_ids_unsorted_d,
-        gaussian_ids_unsorted_d,
-        isect_ids_sorted_d,
-        gaussian_ids_sorted_d,
-        tile_bins_d
-    );
-
-    // cudaMemcpy(
-    //     isect_ids_unsorted,
-    //     isect_ids_unsorted_d,
-    //     num_intersects * sizeof(int64_t),
-    //     cudaMemcpyDeviceToHost
-    // );
-    // cudaMemcpy(
-    //     gaussian_ids_unsorted,
-    //     gaussian_ids_unsorted_d,
-    //     num_intersects * sizeof(int32_t),
-    //     cudaMemcpyDeviceToHost
-    // );
-    // cudaMemcpy(
-    //     isect_ids_sorted,
-    //     isect_ids_sorted_d,
-    //     num_intersects * sizeof(int64_t),
-    //     cudaMemcpyDeviceToHost
-    // );
-    cudaMemcpy(
-        gaussian_ids_sorted,
-        gaussian_ids_sorted_d,
-        num_intersects * sizeof(int32_t),
-        cudaMemcpyDeviceToHost
-    );
-    //
-    // for (int i = 0; i < num_intersects; ++i) {
-    //     printf("%d unsorted isect %016lx point %03d\n", i,
-    //     isect_ids_unsorted[i], gaussian_ids_unsorted[i]);
-    // }
-    // for (int i = 0; i < num_intersects; ++i) {
-    //     printf("sorted isect %016lx point %03d\n", isect_ids_sorted[i],
-    //     gaussian_ids_sorted[i]);
-    // }
-    // cudaMemcpy(tile_bins, tile_bins_d, num_tiles * sizeof(int2),
-    // cudaMemcpyDeviceToHost); for (int i = 0; i < num_tiles; ++i) {
-    //     printf("tile_bins %d %d %d\t", i, tile_bins[i].x, tile_bins[i].y);
-    // }
-    // std::cout << std::endl;
-
-    float *out_img = new float[C * W * H];
-    float *out_img_d;
-    float *final_Ts_d;
-    int *final_idx_d;
-    cudaMalloc((void **)&out_img_d, C * W * H * sizeof(float));
-    cudaMalloc((void **)&final_Ts_d, W * H * sizeof(float));
-    cudaMalloc((void **)&final_idx_d, W * H * sizeof(int));
-
-    // const float background[3] = {1.0f, 1.0f, 1.0f};
-    const float background[3] = {0.0f, 0.0f, 0.0f};
-    float *background_d;
-    cudaMalloc((void **)&background_d, C * sizeof(float));
-    cudaMemcpy(
-        background_d, background, C * sizeof(float), cudaMemcpyHostToDevice
-    );
-
-    printf("final rasterize pass\n");
-    rasterize_forward_impl (
-        tile_bounds,
-        block,
-        img_size,
-        C,
-        gaussian_ids_sorted_d,
-        tile_bins_d,
-        xy_d,
-        conics_d,
-        rgbs_d,
-        opacities_d,
-        final_Ts_d,
-        final_idx_d,
-        out_img_d,
-        background_d
-    );
-    cudaMemcpy(
-        out_img, out_img_d, C * W * H * sizeof(float), cudaMemcpyDeviceToHost
-    );
-    TGAImage image(W, H, TGAImage::RGB);
-    int idx;
-    float3 c;
-    TGAColor col;
-    for (int y = 0; y < H; ++y) {
-        for (int x = 0; x < W; ++x) {
-            idx = y * W + x;
-            for (int c = 0; c < C; ++c) {
-                col[c] = (uint8_t)(255.f * out_img[C * idx + c]);
-            }
-            col[3] = 255;
-            image.set(x, y, col);
-        }
-    }
-    image.write_tga_file("output.tga");
-
-    printf("freeing memory...\n");
-
-    cudaFree(scales_d);
-    cudaFree(quats_d);
-    cudaFree(rgbs_d);
-    cudaFree(opacities_d);
-    cudaFree(covs3d_d);
-    cudaFree(viewmat_d);
-    cudaFree(xy_d);
-    cudaFree(z_d);
-    cudaFree(radii_d);
-    cudaFree(num_tiles_hit_d);
-    cudaFree(cum_tiles_hit_d);
-    cudaFree(tile_bins_d);
-    cudaFree(isect_ids_unsorted_d);
-    cudaFree(gaussian_ids_unsorted_d);
-    cudaFree(isect_ids_sorted_d);
-    cudaFree(gaussian_ids_sorted_d);
-    cudaFree(conics_d);
-    cudaFree(out_img_d);
-
-    delete[] scales;
-    delete[] means;
-    delete[] rgbs;
-    delete[] opacities;
-    delete[] quats;
-    delete[] covs3d;
-    delete[] xy;
-    delete[] z;
-    delete[] radii;
-    delete[] num_tiles_hit;
-    delete[] cum_tiles_hit;
-    delete[] tile_bins;
-    delete[] gaussian_ids_sorted;
-    delete[] out_img;
-    return 0;
-}
diff --git a/csrc/tgaimage.cpp b/csrc/tgaimage.cpp
deleted file mode 100644
index d9164716f..000000000
--- a/csrc/tgaimage.cpp
+++ /dev/null
@@ -1,262 +0,0 @@
-#include "tgaimage.h"
-#include <cstring>
-#include <iostream>
-
-TGAImage::TGAImage(const int w, const int h, const int bpp)
-    : w(w), h(h), bpp(bpp), data(w * h * bpp, 0) {}
-
-bool TGAImage::read_tga_file(const std::string filename) {
-    std::ifstream in;
-    in.open(filename, std::ios::binary);
-    if (!in.is_open()) {
-        std::cerr << "can't open file " << filename << "\n";
-        return false;
-    }
-    TGAHeader header;
-    in.read(reinterpret_cast<char *>(&header), sizeof(header));
-    if (!in.good()) {
-        std::cerr << "an error occured while reading the header\n";
-        return false;
-    }
-    w = header.width;
-    h = header.height;
-    bpp = header.bitsperpixel >> 3;
-    if (w <= 0 || h <= 0 || (bpp != GRAYSCALE && bpp != RGB && bpp != RGBA)) {
-        std::cerr << "bad bpp (or width/height) value\n";
-        return false;
-    }
-    size_t nbytes = bpp * w * h;
-    data = std::vector<std::uint8_t>(nbytes, 0);
-    if (3 == header.datatypecode || 2 == header.datatypecode) {
-        in.read(reinterpret_cast<char *>(data.data()), nbytes);
-        if (!in.good()) {
-            std::cerr << "an error occured while reading the data\n";
-            return false;
-        }
-    } else if (10 == header.datatypecode || 11 == header.datatypecode) {
-        if (!load_rle_data(in)) {
-            std::cerr << "an error occured while reading the data\n";
-            return false;
-        }
-    } else {
-        std::cerr << "unknown file format " << (int)header.datatypecode << "\n";
-        return false;
-    }
-    if (!(header.imagedescriptor & 0x20))
-        flip_vertically();
-    if (header.imagedescriptor & 0x10)
-        flip_horizontally();
-    std::cerr << w << "x" << h << "/" << bpp * 8 << "\n";
-    return true;
-}
-
-bool TGAImage::load_rle_data(std::ifstream &in) {
-    size_t pixelcount = w * h;
-    size_t currentpixel = 0;
-    size_t currentbyte = 0;
-    TGAColor colorbuffer;
-    do {
-        std::uint8_t chunkheader = 0;
-        chunkheader = in.get();
-        if (!in.good()) {
-            std::cerr << "an error occured while reading the data\n";
-            return false;
-        }
-        if (chunkheader < 128) {
-            chunkheader++;
-            for (int i = 0; i < chunkheader; i++) {
-                in.read(reinterpret_cast<char *>(colorbuffer.bgra), bpp);
-                if (!in.good()) {
-                    std::cerr << "an error occured while reading the header\n";
-                    return false;
-                }
-                for (int t = 0; t < bpp; t++)
-                    data[currentbyte++] = colorbuffer.bgra[t];
-                currentpixel++;
-                if (currentpixel > pixelcount) {
-                    std::cerr << "Too many pixels read\n";
-                    return false;
-                }
-            }
-        } else {
-            chunkheader -= 127;
-            in.read(reinterpret_cast<char *>(colorbuffer.bgra), bpp);
-            if (!in.good()) {
-                std::cerr << "an error occured while reading the header\n";
-                return false;
-            }
-            for (int i = 0; i < chunkheader; i++) {
-                for (int t = 0; t < bpp; t++)
-                    data[currentbyte++] = colorbuffer.bgra[t];
-                currentpixel++;
-                if (currentpixel > pixelcount) {
-                    std::cerr << "Too many pixels read\n";
-                    return false;
-                }
-            }
-        }
-    } while (currentpixel < pixelcount);
-    return true;
-}
-
-bool TGAImage::write_tga_file(
-    const std::string filename, const bool vflip, const bool rle
-) const {
-    constexpr std::uint8_t developer_area_ref[4] = {0, 0, 0, 0};
-    constexpr std::uint8_t extension_area_ref[4] = {0, 0, 0, 0};
-    constexpr std::uint8_t footer[18] = {
-        'T',
-        'R',
-        'U',
-        'E',
-        'V',
-        'I',
-        'S',
-        'I',
-        'O',
-        'N',
-        '-',
-        'X',
-        'F',
-        'I',
-        'L',
-        'E',
-        '.',
-        '\0'};
-    std::ofstream out;
-    out.open(filename, std::ios::binary);
-    if (!out.is_open()) {
-        std::cerr << "can't open file " << filename << "\n";
-        return false;
-    }
-    TGAHeader header = {};
-    header.bitsperpixel = bpp << 3;
-    header.width = w;
-    header.height = h;
-    header.datatypecode = (bpp == GRAYSCALE ? (rle ? 11 : 3) : (rle ? 10 : 2));
-    header.imagedescriptor =
-        vflip ? 0x00 : 0x20; // top-left or bottom-left origin
-    out.write(reinterpret_cast<const char *>(&header), sizeof(header));
-    if (!out.good()) {
-        std::cerr << "can't dump the tga file\n";
-        return false;
-    }
-    if (!rle) {
-        out.write(reinterpret_cast<const char *>(data.data()), w * h * bpp);
-        if (!out.good()) {
-            std::cerr << "can't unload raw data\n";
-            return false;
-        }
-    } else if (!unload_rle_data(out)) {
-        std::cerr << "can't unload rle data\n";
-        return false;
-    }
-    out.write(
-        reinterpret_cast<const char *>(developer_area_ref),
-        sizeof(developer_area_ref)
-    );
-    if (!out.good()) {
-        std::cerr << "can't dump the tga file\n";
-        return false;
-    }
-    out.write(
-        reinterpret_cast<const char *>(extension_area_ref),
-        sizeof(extension_area_ref)
-    );
-    if (!out.good()) {
-        std::cerr << "can't dump the tga file\n";
-        return false;
-    }
-    out.write(reinterpret_cast<const char *>(footer), sizeof(footer));
-    if (!out.good()) {
-        std::cerr << "can't dump the tga file\n";
-        return false;
-    }
-    return true;
-}
-
-// TODO: it is not necessary to break a raw chunk for two equal pixels (for the
-// matter of the resulting size)
-bool TGAImage::unload_rle_data(std::ofstream &out) const {
-    const std::uint8_t max_chunk_length = 128;
-    size_t npixels = w * h;
-    size_t curpix = 0;
-    while (curpix < npixels) {
-        size_t chunkstart = curpix * bpp;
-        size_t curbyte = curpix * bpp;
-        std::uint8_t run_length = 1;
-        bool raw = true;
-        while (curpix + run_length < npixels && run_length < max_chunk_length) {
-            bool succ_eq = true;
-            for (int t = 0; succ_eq && t < bpp; t++)
-                succ_eq = (data[curbyte + t] == data[curbyte + t + bpp]);
-            curbyte += bpp;
-            if (1 == run_length)
-                raw = !succ_eq;
-            if (raw && succ_eq) {
-                run_length--;
-                break;
-            }
-            if (!raw && !succ_eq)
-                break;
-            run_length++;
-        }
-        curpix += run_length;
-        out.put(raw ? run_length - 1 : run_length + 127);
-        if (!out.good()) {
-            std::cerr << "can't dump the tga file\n";
-            return false;
-        }
-        out.write(
-            reinterpret_cast<const char *>(data.data() + chunkstart),
-            (raw ? run_length * bpp : bpp)
-        );
-        if (!out.good()) {
-            std::cerr << "can't dump the tga file\n";
-            return false;
-        }
-    }
-    return true;
-}
-
-TGAColor TGAImage::get(const int x, const int y) const {
-    if (!data.size() || x < 0 || y < 0 || x >= w || y >= h)
-        return {};
-    TGAColor ret = {0, 0, 0, 0, bpp};
-    const std::uint8_t *p = data.data() + (x + y * w) * bpp;
-    for (int i = bpp; i--; ret.bgra[i] = p[i])
-        ;
-    return ret;
-}
-
-void TGAImage::set(int x, int y, const TGAColor &c) {
-    if (!data.size() || x < 0 || y < 0 || x >= w || y >= h)
-        return;
-    memcpy(data.data() + (x + y * w) * bpp, c.bgra, bpp);
-}
-
-void TGAImage::flip_horizontally() {
-    int half = w >> 1;
-    for (int i = 0; i < half; i++)
-        for (int j = 0; j < h; j++)
-            for (int b = 0; b < bpp; b++)
-                std::swap(
-                    data[(i + j * w) * bpp + b],
-                    data[(w - 1 - i + j * w) * bpp + b]
-                );
-}
-
-void TGAImage::flip_vertically() {
-    int half = h >> 1;
-    for (int i = 0; i < w; i++)
-        for (int j = 0; j < half; j++)
-            for (int b = 0; b < bpp; b++)
-                std::swap(
-                    data[(i + j * w) * bpp + b],
-                    data[(i + (h - 1 - j) * w) * bpp + b]
-                );
-}
-
-int TGAImage::width() const { return w; }
-
-int TGAImage::height() const { return h; }
diff --git a/csrc/tgaimage.h b/csrc/tgaimage.h
deleted file mode 100644
index 9aa44b7e2..000000000
--- a/csrc/tgaimage.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#pragma once
-#include <cstdint>
-#include <fstream>
-#include <vector>
-
-#pragma pack(push, 1)
-struct TGAHeader {
-    std::uint8_t idlength = 0;
-    std::uint8_t colormaptype = 0;
-    std::uint8_t datatypecode = 0;
-    std::uint16_t colormaporigin = 0;
-    std::uint16_t colormaplength = 0;
-    std::uint8_t colormapdepth = 0;
-    std::uint16_t x_origin = 0;
-    std::uint16_t y_origin = 0;
-    std::uint16_t width = 0;
-    std::uint16_t height = 0;
-    std::uint8_t bitsperpixel = 0;
-    std::uint8_t imagedescriptor = 0;
-};
-#pragma pack(pop)
-
-struct TGAColor {
-    std::uint8_t bgra[4] = {0, 0, 0, 0};
-    std::uint8_t bytespp = 4;
-    std::uint8_t &operator[](const int i) { return bgra[i]; }
-};
-
-struct TGAImage {
-    enum Format { GRAYSCALE = 1, RGB = 3, RGBA = 4 };
-
-    TGAImage() = default;
-    TGAImage(const int w, const int h, const int bpp);
-    bool read_tga_file(const std::string filename);
-    bool write_tga_file(
-        const std::string filename,
-        const bool vflip = true,
-        const bool rle = true
-    ) const;
-    void flip_horizontally();
-    void flip_vertically();
-    TGAColor get(const int x, const int y) const;
-    void set(const int x, const int y, const TGAColor &c);
-    int width() const;
-    int height() const;
-
-  private:
-    bool load_rle_data(std::ifstream &in);
-    bool unload_rle_data(std::ofstream &out) const;
-
-    int w = 0;
-    int h = 0;
-    std::uint8_t bpp = 0;
-    std::vector<std::uint8_t> data = {};
-};
diff --git a/csrc/third_party/doctest b/csrc/third_party/doctest
deleted file mode 160000
index ae7a13539..000000000
--- a/csrc/third_party/doctest
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit ae7a13539fb71f270b87eb2e874fbac80bc8dda2
diff --git a/diff_rast/__init__.py b/diff_rast/__init__.py
new file mode 100644
index 000000000..066567ca2
--- /dev/null
+++ b/diff_rast/__init__.py
@@ -0,0 +1,9 @@
+from .project_gaussians import ProjectGaussians
+from .rasterize import RasterizeGaussians
+from .version import __version__
+
+__all__ = [
+    "__version__",
+    "ProjectGaussians",
+    "RasterizeGaussians",
+]
diff --git a/diff_rast/_torch_impl.py b/diff_rast/_torch_impl.py
new file mode 100644
index 000000000..6dc3d9214
--- /dev/null
+++ b/diff_rast/_torch_impl.py
@@ -0,0 +1,109 @@
+"""Pure PyTorch implementation"""
+
+from jaxtyping import Float
+import torch
+from torch import Tensor
+
+
+def compute_sh_color(
+    viewdirs: Float[Tensor, "*batch 3"], sh_coeffs: Float[Tensor, "*batch D C"]
+):
+    """
+    :param viewdirs (*, C)
+    :param sh_coeffs (*, D, C) sh coefficients for each color channel
+    return colors (*, C)
+    """
+    *dims, dim_sh, C = sh_coeffs.shape
+    bases = eval_sh_bases(dim_sh, viewdirs)  # (*, dim_sh)
+    return (bases[..., None] * sh_coeffs).sum(dim=-2)
+
+
+"""
+Taken from https://github.com/sxyu/svox2
+"""
+
+SH_C0 = 0.28209479177387814
+SH_C1 = 0.4886025119029199
+SH_C2 = [
+    1.0925484305920792,
+    -1.0925484305920792,
+    0.31539156525252005,
+    -1.0925484305920792,
+    0.5462742152960396,
+]
+SH_C3 = [
+    -0.5900435899266435,
+    2.890611442640554,
+    -0.4570457994644658,
+    0.3731763325901154,
+    -0.4570457994644658,
+    1.445305721320277,
+    -0.5900435899266435,
+]
+SH_C4 = [
+    2.5033429417967046,
+    -1.7701307697799304,
+    0.9461746957575601,
+    -0.6690465435572892,
+    0.10578554691520431,
+    -0.6690465435572892,
+    0.47308734787878004,
+    -1.7701307697799304,
+    0.6258357354491761,
+]
+
+MAX_SH_BASIS = 10
+
+
+def eval_sh_bases(basis_dim: int, dirs: torch.Tensor):
+    """
+    Evaluate spherical harmonics bases at unit directions,
+    without taking linear combination.
+    At each point, the final result may the be
+    obtained through simple multiplication.
+
+    :param basis_dim: int SH basis dim. Currently, 1-25 square numbers supported
+    :param dirs: torch.Tensor (..., 3) unit directions
+
+    :return: torch.Tensor (..., basis_dim)
+    """
+    result = torch.empty(
+        (*dirs.shape[:-1], basis_dim), dtype=dirs.dtype, device=dirs.device
+    )
+    result[..., 0] = SH_C0
+    if basis_dim > 1:
+        x, y, z = dirs.unbind(-1)
+        result[..., 1] = -SH_C1 * y
+        result[..., 2] = SH_C1 * z
+        result[..., 3] = -SH_C1 * x
+        if basis_dim > 4:
+            xx, yy, zz = x * x, y * y, z * z
+            xy, yz, xz = x * y, y * z, x * z
+            result[..., 4] = SH_C2[0] * xy
+            result[..., 5] = SH_C2[1] * yz
+            result[..., 6] = SH_C2[2] * (2.0 * zz - xx - yy)
+            result[..., 7] = SH_C2[3] * xz
+            result[..., 8] = SH_C2[4] * (xx - yy)
+
+            if basis_dim > 9:
+                result[..., 9] = SH_C3[0] * y * (3 * xx - yy)
+                result[..., 10] = SH_C3[1] * xy * z
+                result[..., 11] = SH_C3[2] * y * (4 * zz - xx - yy)
+                result[..., 12] = SH_C3[3] * z * (2 * zz - 3 * xx - 3 * yy)
+                result[..., 13] = SH_C3[4] * x * (4 * zz - xx - yy)
+                result[..., 14] = SH_C3[5] * z * (xx - yy)
+                result[..., 15] = SH_C3[6] * x * (xx - 3 * yy)
+
+                if basis_dim > 16:
+                    result[..., 16] = SH_C4[0] * xy * (xx - yy)
+                    result[..., 17] = SH_C4[1] * yz * (3 * xx - yy)
+                    result[..., 18] = SH_C4[2] * xy * (7 * zz - 1)
+                    result[..., 19] = SH_C4[3] * yz * (7 * zz - 3)
+                    result[..., 20] = SH_C4[4] * (zz * (35 * zz - 30) + 3)
+                    result[..., 21] = SH_C4[5] * xz * (7 * zz - 3)
+                    result[..., 22] = SH_C4[6] * (xx - yy) * (7 * zz - 1)
+                    result[..., 23] = SH_C4[7] * xz * (xx - 3 * yy)
+                    result[..., 24] = SH_C4[8] * (
+                        xx * (xx - 3 * yy) - yy * (3 * xx - yy)
+                    )
+    return result
diff --git a/diff_rast/cuda/__init__.py b/diff_rast/cuda/__init__.py
new file mode 100644
index 000000000..f995b41fe
--- /dev/null
+++ b/diff_rast/cuda/__init__.py
@@ -0,0 +1,20 @@
+from typing import Callable
+
+
+def _make_lazy_cuda_func(name: str) -> Callable:
+    def call_cuda(*args, **kwargs):
+        # pylint: disable=import-outside-toplevel
+        from ._backend import _C
+
+        return getattr(_C, name)(*args, **kwargs)
+
+    return call_cuda
+
+
+rasterize_forward = _make_lazy_cuda_func("rasterize_forward")
+rasterize_backward = _make_lazy_cuda_func("rasterize_backward")
+compute_cov2d_bounds_forward = _make_lazy_cuda_func("compute_cov2d_bounds_forward")
+project_gaussians_forward = _make_lazy_cuda_func("project_gaussians_forward")
+project_gaussians_backward = _make_lazy_cuda_func("project_gaussians_backward")
+compute_sh_forward = _make_lazy_cuda_func("compute_sh_forward")
+compute_sh_backward = _make_lazy_cuda_func("compute_sh_backward")
diff --git a/diff_rast/cuda/_backend.py b/diff_rast/cuda/_backend.py
new file mode 100644
index 000000000..2d8ce35d0
--- /dev/null
+++ b/diff_rast/cuda/_backend.py
@@ -0,0 +1,92 @@
+import glob
+import json
+import os
+import shutil
+from subprocess import DEVNULL, call
+
+from rich.console import Console
+from torch.utils.cpp_extension import _get_build_directory, load
+
+PATH = os.path.dirname(os.path.abspath(__file__))
+
+
+def cuda_toolkit_available():
+    """Check if the nvcc is avaiable on the machine."""
+    try:
+        call(["nvcc"], stdout=DEVNULL, stderr=DEVNULL)
+        return True
+    except FileNotFoundError:
+        return False
+
+
+def cuda_toolkit_version():
+    """Get the cuda toolkit version."""
+    cuda_home = os.path.join(os.path.dirname(shutil.which("nvcc")), "..")
+    if os.path.exists(os.path.join(cuda_home, "version.txt")):
+        with open(os.path.join(cuda_home, "version.txt")) as f:
+            cuda_version = f.read().strip().split()[-1]
+    elif os.path.exists(os.path.join(cuda_home, "version.json")):
+        with open(os.path.join(cuda_home, "version.json")) as f:
+            cuda_version = json.load(f)["cuda"]["version"]
+    else:
+        raise RuntimeError("Cannot find the cuda version.")
+    return cuda_version
+
+
+name = "diff_rast_cuda"
+build_dir = _get_build_directory(name, verbose=False)
+extra_include_paths = [os.path.join(PATH, "csrc/third_party/glm")]
+extra_cflags = ["-O3"]
+extra_cuda_cflags = ["-O3"]
+
+_C = None
+sources = list(glob.glob(os.path.join(PATH, "csrc/*.cu"))) + list(
+    glob.glob(os.path.join(PATH, "csrc/*.cpp"))
+)
+# sources = [
+#     os.path.join(PATH, "csrc/ext.cpp"),
+#     os.path.join(PATH, "csrc/rasterize.cu"),
+#     os.path.join(PATH, "csrc/bindings.cu"),
+#     os.path.join(PATH, "csrc/forward.cu"),
+#     os.path.join(PATH, "csrc/backward.cu"),
+# ]
+
+try:
+    # try to import the compiled module (via setup.py)
+    from diff_rast import csrc as _C
+except ImportError:
+    # if failed, try with JIT compilation
+    if cuda_toolkit_available():
+        if os.listdir(build_dir) != []:
+            # If the build exists, we assume the extension has been built
+            # and we can load it.
+
+            _C = load(
+                name=name,
+                sources=sources,
+                extra_cflags=extra_cflags,
+                extra_cuda_cflags=extra_cuda_cflags,
+                extra_include_paths=extra_include_paths,
+            )
+        else:
+            # Build from scratch. Remove the build directory just to be safe: pytorch jit might stuck
+            # if the build directory exists.
+            shutil.rmtree(build_dir)
+            with Console().status(
+                "[bold yellow]DiffRast: Setting up CUDA (This may take a few minutes the first time)",
+                spinner="bouncingBall",
+            ):
+                _C = load(
+                    name=name,
+                    sources=sources,
+                    extra_cflags=extra_cflags,
+                    extra_cuda_cflags=extra_cuda_cflags,
+                    extra_include_paths=extra_include_paths,
+                )
+    else:
+        Console().print(
+            "[yellow]DiffRast: No CUDA toolkit found. DiffRast will be disabled.[/yellow]"
+        )
+
+
+__all__ = ["_C"]
diff --git a/csrc/CMakeLists.txt b/diff_rast/cuda/csrc/CMakeLists.txt
similarity index 100%
rename from csrc/CMakeLists.txt
rename to diff_rast/cuda/csrc/CMakeLists.txt
diff --git a/csrc/backward.cu b/diff_rast/cuda/csrc/backward.cu
similarity index 100%
rename from csrc/backward.cu
rename to diff_rast/cuda/csrc/backward.cu
diff --git a/csrc/backward.cuh b/diff_rast/cuda/csrc/backward.cuh
similarity index 100%
rename from csrc/backward.cuh
rename to diff_rast/cuda/csrc/backward.cuh
diff --git a/csrc/bindings.cu b/diff_rast/cuda/csrc/bindings.cu
similarity index 100%
rename from csrc/bindings.cu
rename to diff_rast/cuda/csrc/bindings.cu
diff --git a/csrc/bindings.h b/diff_rast/cuda/csrc/bindings.h
similarity index 100%
rename from csrc/bindings.h
rename to diff_rast/cuda/csrc/bindings.h
diff --git a/csrc/config.h b/diff_rast/cuda/csrc/config.h
similarity index 100%
rename from csrc/config.h
rename to diff_rast/cuda/csrc/config.h
diff --git a/csrc/ext.cpp b/diff_rast/cuda/csrc/ext.cpp
similarity index 100%
rename from csrc/ext.cpp
rename to diff_rast/cuda/csrc/ext.cpp
diff --git a/csrc/forward.cu b/diff_rast/cuda/csrc/forward.cu
similarity index 99%
rename from csrc/forward.cu
rename to diff_rast/cuda/csrc/forward.cu
index 26c7d2ad6..aebf60816 100644
--- a/csrc/forward.cu
+++ b/diff_rast/cuda/csrc/forward.cu
@@ -522,6 +522,7 @@ __host__ __device__ float3 project_cov3d_ewa(
 __host__ __device__ void scale_rot_to_cov3d(
     const float3 scale, const float glob_scale, const float4 quat, float *cov3d
 ) {
+    // printf("quat %.2f %.2f %.2f %.2f\n", quat.x, quat.y, quat.z, quat.w);
     glm::mat3 R = quat_to_rotmat(quat);
     // printf("R %.2f %.2f %.2f\n", R[0][0], R[1][1], R[2][2]);
     glm::mat3 S = scale_to_mat(scale, glob_scale);
diff --git a/csrc/forward.cuh b/diff_rast/cuda/csrc/forward.cuh
similarity index 100%
rename from csrc/forward.cuh
rename to diff_rast/cuda/csrc/forward.cuh
diff --git a/csrc/helpers.cuh b/diff_rast/cuda/csrc/helpers.cuh
similarity index 100%
rename from csrc/helpers.cuh
rename to diff_rast/cuda/csrc/helpers.cuh
diff --git a/csrc/rasterize.cu b/diff_rast/cuda/csrc/rasterize.cu
similarity index 100%
rename from csrc/rasterize.cu
rename to diff_rast/cuda/csrc/rasterize.cu
diff --git a/csrc/rasterize.h b/diff_rast/cuda/csrc/rasterize.h
similarity index 100%
rename from csrc/rasterize.h
rename to diff_rast/cuda/csrc/rasterize.h
diff --git a/csrc/serial_backward.cu b/diff_rast/cuda/csrc/serial_backward.cu
similarity index 100%
rename from csrc/serial_backward.cu
rename to diff_rast/cuda/csrc/serial_backward.cu
diff --git a/csrc/serial_backward.cuh b/diff_rast/cuda/csrc/serial_backward.cuh
similarity index 100%
rename from csrc/serial_backward.cuh
rename to diff_rast/cuda/csrc/serial_backward.cuh
diff --git a/csrc/sh.cuh b/diff_rast/cuda/csrc/sh.cuh
similarity index 100%
rename from csrc/sh.cuh
rename to diff_rast/cuda/csrc/sh.cuh
diff --git a/csrc/third_party/glm b/diff_rast/cuda/csrc/third_party/glm
similarity index 100%
rename from csrc/third_party/glm
rename to diff_rast/cuda/csrc/third_party/glm
diff --git a/diff_rast/diff_rast/__init__.py b/diff_rast/diff_rast/__init__.py
deleted file mode 100644
index 5f42e0783..000000000
--- a/diff_rast/diff_rast/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-import torch
\ No newline at end of file
diff --git a/diff_rast/diff_rast/_torch_impl.py b/diff_rast/diff_rast/_torch_impl.py
deleted file mode 100644
index 4de51a47b..000000000
--- a/diff_rast/diff_rast/_torch_impl.py
+++ /dev/null
@@ -1,276 +0,0 @@
-"""Pure PyTorch implementation"""
-
-from jaxtyping import Float
-import torch
-import torch.nn.functional as F
-from torch import Tensor
-
-
-def quat_to_rotmat(quat: Tensor) -> Tensor:
-    assert quat.shape[-1] == 4, quat.shape
-    x, y, z, w = torch.unbind(F.normalize(quat, dim=-1), dim=-1)
-    return torch.stack(
-        [
-            torch.stack(
-                [
-                    1 - 2 * (y**2 + z**2),
-                    2 * (x * y - w * z),
-                    2 * (x * z + w * y),
-                ],
-                dim=-1,
-            ),
-            torch.stack(
-                [
-                    2 * (x * y + w * z),
-                    1 - 2 * (x**2 + z**2),
-                    2 * (y * z - w * x),
-                ],
-                dim=-1,
-            ),
-            torch.stack(
-                [
-                    2 * (x * z - w * y),
-                    2 * (y * z + w * x),
-                    1 - 2 * (x**2 + y**2),
-                ],
-                dim=-1,
-            ),
-        ],
-        dim=-2,
-    )
-
-
-def scale_rot_to_cov3d(
-    scale: Tensor, glob_scale: float, quat: Tensor
-) -> Tensor:
-    assert scale.shape[-1] == 3, scale.shape
-    assert quat.shape[-1] == 4, quat.shape
-    assert scale.shape[:-1] == quat.shape[:-1], (scale.shape, quat.shape)
-    R = quat_to_rotmat(quat)  # (..., 3, 3)
-    M = R * glob_scale * scale[..., None, :]  # (..., 3, 3)
-    # TODO: save upper right because symmetric
-    return M @ M.transpose(-1, -2)  # (..., 3, 3)
-
-
-def project_cov3d_ewa(
-    mean3d: Tensor, cov3d: Tensor, viewmat: Tensor, fx: float, fy: float
-) -> Tensor:
-    assert mean3d.shape[-1] == 3, mean3d.shape
-    assert cov3d.shape[-2:] == (3, 3), cov3d.shape
-    assert viewmat.shape[-2:] == (4, 4), viewmat.shape
-    W = viewmat[..., :3, :3]  # (..., 3, 3)
-    p = viewmat[..., :3, 3]  # (..., 3)
-    t = torch.matmul(W, mean3d[..., None])[..., 0] + p  # (..., 3)
-    rz = 1.0 / t[..., 2]  # (...,)
-    rz2 = rz**2  # (...,)
-    J = torch.stack(
-        [
-            torch.stack(
-                [fx * rz, torch.zeros_like(rz), -fx * t[..., 0] * rz2], dim=-1
-            ),
-            torch.stack(
-                [torch.zeros_like(rz), fy * rz, -fy * t[..., 1] * rz2], dim=-1
-            ),
-        ],
-        dim=-2,
-    )  # (..., 2, 3)
-    T = J @ W  # (..., 2, 3)
-    cov2d = T @ cov3d @ T.transpose(-1, -2)  # (..., 2, 2)
-    # add a little blur along axes and (TODO save upper triangular elements)
-    cov2d[..., 0, 0] = cov2d[..., 0, 0] + 0.1
-    cov2d[..., 1, 1] = cov2d[..., 1, 1] + 0.1
-    return cov2d
-
-
-def compute_cov2d_bounds(cov2d: Tensor, eps=1e-6):
-    det = cov2d[..., 0, 0] * cov2d[..., 1, 1] - cov2d[..., 0, 1] ** 2
-    det = torch.clamp(det, min=eps)
-    conic = torch.stack(
-        [
-            cov2d[..., 1, 1] / det,
-            -cov2d[..., 0, 1] / det,
-            cov2d[..., 0, 0] / det,
-        ],
-        dim=-1,
-    )  # (..., 3)
-    b = (cov2d[..., 0, 0] + cov2d[..., 1, 1]) / 2  # (...,)
-    v1 = b + torch.sqrt(torch.clamp(b**2 - det, min=0.1))  # (...,)
-    v2 = b - torch.sqrt(torch.clamp(b**2 - det, min=0.1))  # (...,)
-    radius = torch.ceil(3.0 * torch.sqrt(torch.max(v1, v2)))  # (...,)
-    return conic, radius, det > eps
-
-
-def ndc2pix(x, W):
-    return 0.5 * ((x + 1.0) * W - 1.0)
-
-
-def project_pix(mat, p, img_size, eps=1e-6):
-    p_hom = F.pad(p, (0, 1), value=1.0)
-    p_hom = torch.einsum("...ij,...j->...i", mat, p_hom)
-    rw = 1.0 / torch.clamp(p_hom[..., 3], min=eps)
-    p_proj = p_hom[..., :3] * rw[..., None]
-    u = ndc2pix(p_proj[..., 0], img_size[0])
-    v = ndc2pix(p_proj[..., 1], img_size[1])
-    return torch.stack([u, v], dim=-1)
-
-
-def clip_near_plane(p, viewmat, thresh=0.1):
-    R = viewmat[..., :3, :3]
-    T = viewmat[..., :3, 3]
-    p_view = torch.matmul(R, p[..., None])[..., 0] + T
-    return p_view, p_view[..., 2] < thresh
-
-
-def get_tile_bbox(pix_center, pix_radius, tile_bounds, BLOCK_X=16, BLOCK_Y=16):
-    tile_size = torch.tensor(
-        [BLOCK_X, BLOCK_Y], dtype=torch.float32, device=pix_center.device
-    )
-    tile_center = pix_center / tile_size
-    tile_radius = pix_radius[..., None] / tile_size
-
-    top_left = (tile_center - tile_radius).to(torch.int32)
-    bottom_right = (tile_center + tile_radius).to(torch.int32) + 1
-    tile_min = torch.stack(
-        [
-            torch.clamp(top_left[..., 0], 0, tile_bounds[0]),
-            torch.clamp(top_left[..., 1], 0, tile_bounds[1]),
-        ],
-        -1,
-    )
-    tile_max = torch.stack(
-        [
-            torch.clamp(bottom_right[..., 0], 0, tile_bounds[0]),
-            torch.clamp(bottom_right[..., 1], 0, tile_bounds[1]),
-        ],
-        -1,
-    )
-    return tile_min, tile_max
-
-
-def project_gaussians_forward(
-    means3d,
-    scales,
-    glob_scale,
-    quats,
-    viewmat,
-    projmat,
-    fx,
-    fy,
-    img_size,
-    tile_bounds,
-):
-    p_view, is_close = clip_near_plane(means3d, viewmat)
-    cov3d = scale_rot_to_cov3d(scales, glob_scale, quats)
-    cov2d = project_cov3d_ewa(means3d, cov3d, viewmat, fx, fy)
-    conic, radius, det_valid = compute_cov2d_bounds(cov2d)
-    center = project_pix(projmat, means3d, img_size)
-    tile_min, tile_max = get_tile_bbox(center, radius, tile_bounds)
-    tile_area = (tile_max[..., 0] - tile_min[..., 0]) * (
-        tile_max[..., 1] - tile_min[..., 1]
-    )
-    mask = (tile_area > 0) & (~is_close) & det_valid
-
-    num_tiles_hit = tile_area
-    depths = p_view[..., 2]
-    radii = radius.to(torch.int32)
-    xys = center
-    conics = conic
-
-    return cov3d, xys, depths, radii, conics, num_tiles_hit, mask
-
-
-def compute_sh_color(viewdirs: Float[Tensor, "*batch 3"], sh_coeffs : Float[Tensor, "*batch D C"]):
-    """
-    :param viewdirs (*, C)
-    :param sh_coeffs (*, D, C) sh coefficients for each color channel
-    return colors (*, C)
-    """
-    *dims, dim_sh, C = sh_coeffs.shape
-    bases = eval_sh_bases(dim_sh, viewdirs)  # (*, dim_sh)
-    return (bases[..., None] * sh_coeffs).sum(dim=-2)
-
-
-"""
-Taken from https://github.com/sxyu/svox2
-"""
-
-SH_C0 = 0.28209479177387814
-SH_C1 = 0.4886025119029199
-SH_C2 = [
-    1.0925484305920792,
-    -1.0925484305920792,
-    0.31539156525252005,
-    -1.0925484305920792,
-    0.5462742152960396
-]
-SH_C3 = [
-    -0.5900435899266435,
-    2.890611442640554,
-    -0.4570457994644658,
-    0.3731763325901154,
-    -0.4570457994644658,
-    1.445305721320277,
-    -0.5900435899266435
-]
-SH_C4 = [
-    2.5033429417967046,
-    -1.7701307697799304,
-    0.9461746957575601,
-    -0.6690465435572892,
-    0.10578554691520431,
-    -0.6690465435572892,
-    0.47308734787878004,
-    -1.7701307697799304,
-    0.6258357354491761,
-]
-
-MAX_SH_BASIS = 10
-
-def eval_sh_bases(basis_dim : int, dirs : torch.Tensor):
-    """
-    Evaluate spherical harmonics bases at unit directions,
-    without taking linear combination.
-    At each point, the final result may the be
-    obtained through simple multiplication.
-
-    :param basis_dim: int SH basis dim. Currently, 1-25 square numbers supported
-    :param dirs: torch.Tensor (..., 3) unit directions
-
-    :return: torch.Tensor (..., basis_dim)
-    """
-    result = torch.empty((*dirs.shape[:-1], basis_dim), dtype=dirs.dtype, device=dirs.device)
-    result[..., 0] = SH_C0
-    if basis_dim > 1:
-        x, y, z = dirs.unbind(-1)
-        result[..., 1] = -SH_C1 * y;
-        result[..., 2] = SH_C1 * z;
-        result[..., 3] = -SH_C1 * x;
-        if basis_dim > 4:
-            xx, yy, zz = x * x, y * y, z * z
-            xy, yz, xz = x * y, y * z, x * z
-            result[..., 4] = SH_C2[0] * xy;
-            result[..., 5] = SH_C2[1] * yz;
-            result[..., 6] = SH_C2[2] * (2.0 * zz - xx - yy);
-            result[..., 7] = SH_C2[3] * xz;
-            result[..., 8] = SH_C2[4] * (xx - yy);
-
-            if basis_dim > 9:
-                result[..., 9] = SH_C3[0] * y * (3 * xx - yy);
-                result[..., 10] = SH_C3[1] * xy * z;
-                result[..., 11] = SH_C3[2] * y * (4 * zz - xx - yy);
-                result[..., 12] = SH_C3[3] * z * (2 * zz - 3 * xx - 3 * yy);
-                result[..., 13] = SH_C3[4] * x * (4 * zz - xx - yy);
-                result[..., 14] = SH_C3[5] * z * (xx - yy);
-                result[..., 15] = SH_C3[6] * x * (xx - 3 * yy);
-
-                if basis_dim > 16:
-                    result[..., 16] = SH_C4[0] * xy * (xx - yy);
-                    result[..., 17] = SH_C4[1] * yz * (3 * xx - yy);
-                    result[..., 18] = SH_C4[2] * xy * (7 * zz - 1);
-                    result[..., 19] = SH_C4[3] * yz * (7 * zz - 3);
-                    result[..., 20] = SH_C4[4] * (zz * (35 * zz - 30) + 3);
-                    result[..., 21] = SH_C4[5] * xz * (7 * zz - 3);
-                    result[..., 22] = SH_C4[6] * (xx - yy) * (7 * zz - 1);
-                    result[..., 23] = SH_C4[7] * xz * (xx - 3 * yy);
-                    result[..., 24] = SH_C4[8] * (xx * (xx - 3 * yy) - yy * (3 * xx - yy));
-    return result
diff --git a/diff_rast/diff_rast/project_gaussians.py b/diff_rast/project_gaussians.py
similarity index 92%
rename from diff_rast/diff_rast/project_gaussians.py
rename to diff_rast/project_gaussians.py
index 47153df6b..88ed6c327 100644
--- a/diff_rast/diff_rast/project_gaussians.py
+++ b/diff_rast/project_gaussians.py
@@ -6,7 +6,7 @@
 from torch import Tensor
 from torch.autograd import Function
 
-import cuda_lib  # make sure to import torch before diff_rast
+import diff_rast.cuda as _C
 
 
 class ProjectGaussians(Function):
@@ -50,7 +50,7 @@ def forward(
             radii,
             conics,
             num_tiles_hit,
-        ) = cuda_lib.project_gaussians_forward(
+        ) = _C.project_gaussians_forward(
             num_points,
             means3d,
             scales,
@@ -100,13 +100,7 @@ def backward(ctx, v_xys, v_depths, v_radii, v_conics, v_num_tiles_hit, v_cov3d):
             conics,
         ) = ctx.saved_tensors
 
-        (
-            v_cov2d,
-            v_cov3d,
-            v_mean3d,
-            v_scale,
-            v_quat,
-        ) = cuda_lib.project_gaussians_backward(
+        (v_cov2d, v_cov3d, v_mean3d, v_scale, v_quat,) = _C.project_gaussians_backward(
             ctx.num_points,
             means3d,
             scales,
diff --git a/diff_rast/diff_rast/rasterize.py b/diff_rast/rasterize.py
similarity index 95%
rename from diff_rast/diff_rast/rasterize.py
rename to diff_rast/rasterize.py
index 536b83368..e219c4d02 100644
--- a/diff_rast/diff_rast/rasterize.py
+++ b/diff_rast/rasterize.py
@@ -7,7 +7,7 @@
 from torch import Tensor
 from torch.autograd import Function
 
-import cuda_lib  # make sure to import torch before diff_rast
+import diff_rast.cuda as _C
 
 
 class RasterizeGaussians(Function):
@@ -59,7 +59,7 @@ def forward(
             gaussian_ids_unsorted,
             isect_ids_sorted,
             isect_ids_unsorted,
-        ) = cuda_lib.rasterize_forward(
+        ) = _C.rasterize_forward(
             xys.contiguous().cuda(),
             depths.contiguous().cuda(),
             radii.contiguous().cuda(),
@@ -105,7 +105,7 @@ def backward(ctx, v_out_img):
             final_idx,
         ) = ctx.saved_tensors
 
-        v_xy, v_conic, v_colors, v_opacity = cuda_lib.rasterize_backward(
+        v_xy, v_conic, v_colors, v_opacity = _C.rasterize_backward(
             img_height,
             img_width,
             gaussian_ids_sorted.contiguous().cuda(),
@@ -132,5 +132,3 @@ def backward(ctx, v_out_img):
             None,  # img_width
             None,  # background
         )
-
-
diff --git a/diff_rast/setup.py b/diff_rast/setup.py
deleted file mode 100644
index 78d204ed8..000000000
--- a/diff_rast/setup.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import os
-
-from setuptools import setup
-from torch.utils.cpp_extension import BuildExtension, CUDAExtension
-
-CSRC = os.path.abspath(f"{__file__}/../../csrc")
-
-setup(
-    name="diff_rast",
-    description=" Python package for differentiable rasterization of gaussians",
-    keywords="gaussian, splatting, cuda",
-    # package_dir = {'': '.'},
-    packages=['diff_rast'],
-    ext_modules=[
-        CUDAExtension(
-            name="cuda_lib",
-            sources=[
-                f"{CSRC}/ext.cpp",
-                f"{CSRC}/rasterize.cu",
-                f"{CSRC}/bindings.cu",
-                f"{CSRC}/forward.cu",
-                f"{CSRC}/backward.cu",
-            ],
-            extra_compile_args={"nvcc": [f"-I {CSRC}/third_party/glm/"]},
-        ),
-    ],
-    cmdclass={"build_ext": BuildExtension},
-)
diff --git a/diff_rast/diff_rast/sh.py b/diff_rast/sh.py
similarity index 82%
rename from diff_rast/diff_rast/sh.py
rename to diff_rast/sh.py
index f97f60c0c..d70498f9d 100644
--- a/diff_rast/diff_rast/sh.py
+++ b/diff_rast/sh.py
@@ -1,7 +1,6 @@
 """Python bindings for SH"""
 
-import torch
-import cuda_lib
+import diff_rast.cuda as _C
 
 from jaxtyping import Float
 from torch import Tensor
@@ -21,13 +20,14 @@ def num_sh_bases(degree: int):
 
 
 class SphericalHarmonics(Function):
-     """Compute spherical harmonics  
-    
+    """Compute spherical harmonics
+
     Args:
         degree (int): degree of SHs.
         viewdirs (Tensor): viewing directions.
         coeffs (Tensor): harmonic coefficients.
     """
+
     @staticmethod
     def forward(
         ctx,
@@ -39,7 +39,7 @@ def forward(
         assert coeffs.shape[-2] == num_sh_bases(degree)
         ctx.degree = degree
         ctx.save_for_backward(viewdirs)
-        return cuda_lib.compute_sh_forward(num_points, degree, viewdirs, coeffs)
+        return _C.compute_sh_forward(num_points, degree, viewdirs, coeffs)
 
     @staticmethod
     def backward(ctx, v_colors: Float[Tensor, "*batch 3"]):
@@ -49,5 +49,5 @@ def backward(ctx, v_colors: Float[Tensor, "*batch 3"]):
         return (
             None,
             None,
-            cuda_lib.compute_sh_backward(num_points, degree, viewdirs, v_colors),
+            _C.compute_sh_backward(num_points, degree, viewdirs, v_colors),
         )
diff --git a/diff_rast/version.py b/diff_rast/version.py
new file mode 100644
index 000000000..6c8e6b979
--- /dev/null
+++ b/diff_rast/version.py
@@ -0,0 +1 @@
+__version__ = "0.0.0"
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 000000000..41c270bb3
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 000000000..49a09e602
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,4 @@
+furo
+sphinx
+sphinx-copybutton
+sphinx-design
\ No newline at end of file
diff --git a/docs/source/apis/proj.rst b/docs/source/apis/proj.rst
new file mode 100644
index 000000000..b8c08b21b
--- /dev/null
+++ b/docs/source/apis/proj.rst
@@ -0,0 +1,7 @@
+Projection
+===================================
+
+.. currentmodule:: diff_rast
+
+.. autoclass:: ProjectGaussians
+    :members:
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 000000000..7b682372c
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,42 @@
+__version__ = None
+exec(open("../../diff_rast/version.py", "r").read())
+
+# -- Project information
+
+project = "diff_rast"
+copyright = "2023, Vickie"
+author = "Vickie"
+
+release = __version__
+
+# -- General configuration
+
+extensions = [
+    "sphinx.ext.napoleon",
+    "sphinx.ext.duration",
+    "sphinx.ext.doctest",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
+    "sphinx.ext.intersphinx",
+]
+
+intersphinx_mapping = {
+    "python": ("https://docs.python.org/3/", None),
+    "sphinx": ("https://www.sphinx-doc.org/en/master/", None),
+}
+intersphinx_disabled_domains = ["std"]
+
+templates_path = ["_templates"]
+
+# -- Options for HTML output
+html_theme = "furo"
+
+# Ignore >>> when copying code
+copybutton_prompt_text = r">>> |\.\.\. "
+copybutton_prompt_is_regexp = True
+
+# -- Options for EPUB output
+epub_show_urls = "footnote"
+
+# typehints
+autodoc_typehints = "description"
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 000000000..6c2b31bbf
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,13 @@
+DiffRast Documentation
+===================================
+
+
+Links:
+-------------
+
+.. toctree::
+   :glob:
+   :maxdepth: 1
+   :caption: Python API
+
+   apis/*
\ No newline at end of file
diff --git a/tests/compare_reference.py b/examples/compare_reference.py
similarity index 98%
rename from tests/compare_reference.py
rename to examples/compare_reference.py
index ebb548997..d2e777bb9 100644
--- a/tests/compare_reference.py
+++ b/examples/compare_reference.py
@@ -113,7 +113,7 @@ def _init_gaussians(self):
         self.viewmat.requires_grad = False
 
     def forward_ours(self):
-        xys, depths, radii, conics, num_tiles_hit,cov3d = ProjectGaussians.apply(
+        xys, depths, radii, conics, num_tiles_hit, cov3d = ProjectGaussians.apply(
             self.means,
             self.scales,
             self.scale_mod,
diff --git a/tests/ref_trainer.py b/examples/ref_trainer.py
similarity index 99%
rename from tests/ref_trainer.py
rename to examples/ref_trainer.py
index af1aade65..779f3d42c 100644
--- a/tests/ref_trainer.py
+++ b/examples/ref_trainer.py
@@ -27,7 +27,7 @@ def projection_matrix(znear, zfar, fovx, fovy, **kwargs):
             [0.0, 0.0, (f + n) / (f - n), -2.0 * f * n / (f - n)],
             [0.0, 0.0, 1.0, 0.0],
         ],
-        **kwargs
+        **kwargs,
     )
 
 
@@ -164,7 +164,6 @@ def forward(self):
         )  # (C, H, W)
         return out_color.permute(1, 2, 0)
 
-
     def train(self, iterations: int = 1000, lr: float = 0.01, save_imgs: bool = True):
         optimizer = optim.Adam(
             [self.rgbs, self.means, self.scales, self.opacities, self.quats], lr
diff --git a/tests/simple_trainer.py b/examples/simple_trainer.py
similarity index 97%
rename from tests/simple_trainer.py
rename to examples/simple_trainer.py
index 046b0bd33..5dd36fb7d 100644
--- a/tests/simple_trainer.py
+++ b/examples/simple_trainer.py
@@ -44,7 +44,7 @@ def _init_gaussians(self):
         self.scales = torch.empty((self.num_points, 3), device=self.device)
         self.quats = torch.empty((self.num_points, 4), device=self.device)
         self.rgbs = torch.ones((self.num_points, 3), device=self.device)
-        self.opacities = torch.ones((self.num_points,1), device=self.device)
+        self.opacities = torch.ones((self.num_points, 1), device=self.device)
         bd = 2
         for i in range(self.num_points):
             self.means[i] = torch.tensor(
@@ -144,13 +144,16 @@ def train(self, iterations: int = 1000, lr: float = 0.01, save_imgs: bool = True
                 loop=0,
             )
 
+
 def image_path_to_tensor(image_path):
     import torchvision.transforms as transforms
+
     img = Image.open(image_path)
     transform = transforms.ToTensor()
-    img_tensor = transform(img).permute(1,2,0)
+    img_tensor = transform(img).permute(1, 2, 0)
     return img_tensor
 
+
 def main(height: int = 256, width: int = 256) -> None:
     gt_image = torch.ones((height, width, 3)) * 1.0
     # make top left and bottom right red,blue
diff --git a/tests/test_sh.py b/examples/test_sh.py
similarity index 100%
rename from tests/test_sh.py
rename to examples/test_sh.py
diff --git a/ref_rast/ref_rast/__init__.py b/ref_rast/ref_rast/__init__.py
deleted file mode 100644
index f05b919e0..000000000
--- a/ref_rast/ref_rast/__init__.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from .rasterize import _RasterizeGaussians, GaussianRasterizationSettings
-
-def rasterize_gaussians(
-    means3D,
-    means2D,
-    sh,
-    colors_precomp,
-    opacities,
-    scales,
-    rotations,
-    cov3Ds_precomp,
-    raster_settings,
-):
-    return _RasterizeGaussians.apply(
-        means3D,
-        means2D,
-        sh,
-        colors_precomp,
-        opacities,
-        scales,
-        rotations,
-        cov3Ds_precomp,
-        raster_settings,
-    )
diff --git a/ref_rast/ref_rast/rasterize.py b/ref_rast/ref_rast/rasterize.py
deleted file mode 100644
index 0892c6739..000000000
--- a/ref_rast/ref_rast/rasterize.py
+++ /dev/null
@@ -1,366 +0,0 @@
-#
-# Copyright (C) 2023, Inria
-# GRAPHDECO research group, https://team.inria.fr/graphdeco
-# All rights reserved.
-#
-# This software is free for non-commercial, research and evaluation use
-# under the terms of the LICENSE.md file.
-#
-# For inquiries contact  george.drettakis@inria.fr
-#
-
-from typing import NamedTuple
-import torch.nn as nn
-import torch
-import ref_rast_cuda
-
-
-def cpu_deep_copy_tuple(input_tuple):
-    copied_tensors = [
-        item.cpu().clone() if isinstance(item, torch.Tensor) else item
-        for item in input_tuple
-    ]
-    return tuple(copied_tensors)
-
-
-def rasterize_gaussians(
-    means3D,
-    means2D,
-    sh,
-    colors_precomp,
-    opacities,
-    scales,
-    rotations,
-    cov3Ds_precomp,
-    raster_settings,
-):
-    return _RasterizeGaussians.apply(
-        means3D,
-        means2D,
-        sh,
-        colors_precomp,
-        opacities,
-        scales,
-        rotations,
-        cov3Ds_precomp,
-        raster_settings,
-    )
-
-
-class _RasterizeGaussians(torch.autograd.Function):
-    @staticmethod
-    def forward(
-        ctx,
-        means3D,
-        means2D,
-        sh,
-        colors_precomp,
-        opacities,
-        scales,
-        rotations,
-        cov3Ds_precomp,
-        raster_settings,
-    ):
-        # Restructure arguments the way that the C++ lib expects them
-        args = (
-            raster_settings.bg,
-            means3D,
-            colors_precomp,
-            opacities,
-            scales,
-            rotations,
-            raster_settings.scale_modifier,
-            cov3Ds_precomp,
-            raster_settings.viewmatrix,
-            raster_settings.projmatrix,
-            raster_settings.tanfovx,
-            raster_settings.tanfovy,
-            raster_settings.image_height,
-            raster_settings.image_width,
-            sh,
-            raster_settings.sh_degree,
-            raster_settings.campos,
-            raster_settings.prefiltered,
-            raster_settings.debug,
-        )
-
-        # Invoke C++/CUDA rasterizer
-        if raster_settings.debug:
-            cpu_args = cpu_deep_copy_tuple(
-                args
-            )  # Copy them before they can be corrupted
-            try:
-                (
-                    num_rendered,
-                    color,
-                    radii,
-                    geomBuffer,
-                    binningBuffer,
-                    imgBuffer,
-                ) = ref_rast_cuda.rasterize_gaussians(*args)
-            except Exception as ex:
-                torch.save(cpu_args, "snapshot_fw.dump")
-                print(
-                    "\nAn error occured in forward. Please forward snapshot_fw.dump for debugging."
-                )
-                raise ex
-        else:
-            (
-                num_rendered,
-                color,
-                radii,
-                geomBuffer,
-                binningBuffer,
-                imgBuffer,
-            ) = ref_rast_cuda.rasterize_gaussians(*args)
-
-        # Keep relevant tensors for backward
-        ctx.raster_settings = raster_settings
-        ctx.num_rendered = num_rendered
-        ctx.save_for_backward(
-            colors_precomp,
-            means3D,
-            scales,
-            rotations,
-            cov3Ds_precomp,
-            radii,
-            sh,
-            geomBuffer,
-            binningBuffer,
-            imgBuffer,
-        )
-        return color, radii
-
-    @staticmethod
-    def backward(ctx, grad_out_color, _):
-        # Restore necessary values from context
-        num_rendered = ctx.num_rendered
-        raster_settings = ctx.raster_settings
-        (
-            colors_precomp,
-            means3D,
-            scales,
-            rotations,
-            cov3Ds_precomp,
-            radii,
-            sh,
-            geomBuffer,
-            binningBuffer,
-            imgBuffer,
-        ) = ctx.saved_tensors
-
-        # Restructure args as C++ method expects them
-        args = (
-            raster_settings.bg,
-            means3D,
-            radii,
-            colors_precomp,
-            scales,
-            rotations,
-            raster_settings.scale_modifier,
-            cov3Ds_precomp,
-            raster_settings.viewmatrix,
-            raster_settings.projmatrix,
-            raster_settings.tanfovx,
-            raster_settings.tanfovy,
-            grad_out_color,
-            sh,
-            raster_settings.sh_degree,
-            raster_settings.campos,
-            geomBuffer,
-            num_rendered,
-            binningBuffer,
-            imgBuffer,
-            raster_settings.debug,
-        )
-
-        # Compute gradients for relevant tensors by invoking backward method
-        if raster_settings.debug:
-            cpu_args = cpu_deep_copy_tuple(
-                args
-            )  # Copy them before they can be corrupted
-            try:
-                (
-                    grad_means2D,
-                    grad_colors_precomp,
-                    grad_opacities,
-                    grad_means3D,
-                    grad_cov3Ds_precomp,
-                    grad_sh,
-                    grad_scales,
-                    grad_rotations,
-                ) = ref_rast_cuda.rasterize_gaussians_backward(*args)
-            except Exception as ex:
-                torch.save(cpu_args, "snapshot_bw.dump")
-                print(
-                    "\nAn error occured in backward. Writing snapshot_bw.dump for debugging.\n"
-                )
-                raise ex
-        else:
-            (
-                grad_means2D,
-                grad_colors_precomp,
-                grad_opacities,
-                grad_means3D,
-                grad_cov3Ds_precomp,
-                grad_sh,
-                grad_scales,
-                grad_rotations,
-            ) = ref_rast_cuda.rasterize_gaussians_backward(*args)
-
-        grads = (
-            grad_means3D,
-            grad_means2D,
-            grad_sh,
-            grad_colors_precomp,
-            grad_opacities,
-            grad_scales,
-            grad_rotations,
-            grad_cov3Ds_precomp,
-            None,
-        )
-
-        return grads
-
-
-class GaussianRasterizationSettings(NamedTuple):
-    image_height: int
-    image_width: int
-    tanfovx: float
-    tanfovy: float
-    bg: torch.Tensor
-    scale_modifier: float
-    viewmatrix: torch.Tensor
-    projmatrix: torch.Tensor
-    sh_degree: int
-    campos: torch.Tensor
-    prefiltered: bool
-    debug: bool
-
-
-class GaussianRasterizer(nn.Module):
-    def __init__(self, raster_settings):
-        super().__init__()
-        self.raster_settings = raster_settings
-
-    def markVisible(self, positions):
-        # Mark visible points (based on frustum culling for camera) with a boolean
-        with torch.no_grad():
-            raster_settings = self.raster_settings
-            visible = ref_rast_cuda.mark_visible(
-                positions, raster_settings.viewmatrix, raster_settings.projmatrix
-            )
-
-        return visible
-
-    def forward(
-        self,
-        means3D,
-        means2D,
-        opacities,
-        shs=None,
-        colors_precomp=None,
-        scales=None,
-        rotations=None,
-        cov3D_precomp=None,
-    ):
-        raster_settings = self.raster_settings
-
-        if (shs is None and colors_precomp is None) or (
-            shs is not None and colors_precomp is not None
-        ):
-            raise Exception(
-                "Please provide excatly one of either SHs or precomputed colors!"
-            )
-
-        if ((scales is None or rotations is None) and cov3D_precomp is None) or (
-            (scales is not None or rotations is not None) and cov3D_precomp is not None
-        ):
-            raise Exception(
-                "Please provide exactly one of either scale/rotation pair or precomputed 3D covariance!"
-            )
-
-        if shs is None:
-            shs = torch.Tensor([])
-        if colors_precomp is None:
-            colors_precomp = torch.Tensor([])
-
-        if scales is None:
-            scales = torch.Tensor([])
-        if rotations is None:
-            rotations = torch.Tensor([])
-        if cov3D_precomp is None:
-            cov3D_precomp = torch.Tensor([])
-
-        # Invoke C++/CUDA rasterization routine
-        return rasterize_gaussians(
-            means3D,
-            means2D,
-            shs,
-            colors_precomp,
-            opacities,
-            scales,
-            rotations,
-            cov3D_precomp,
-            raster_settings,
-        )
-
-
-if __name__ == "__main__":
-    import imageio
-
-    device = torch.device("cuda:0")
-    num_points = 256
-    n = 16
-    # means3d = torch.randn((num_points, 3), device=device, requires_grad=True)
-    linspace = torch.linspace(-1, 1, n, device=device)
-    x, y = torch.meshgrid(linspace, linspace)
-    z = torch.linspace(0, 1, num_points, device=device)
-    print(x.shape, y.shape, z.shape)
-    means3d = torch.stack([x.flatten(), y.flatten(), z], dim=-1)
-    print(means3d.shape)
-    means2d = torch.tensor([])
-    sh = torch.Tensor([])
-    opacities = torch.ones((num_points, 1), device=device)
-    colors = torch.rand((num_points, 3), device=device)
-    covs3d = torch.Tensor([])
-    # scales = torch.rand((num_points, 3), device=device)
-    scales = torch.ones((num_points, 3), device=device)
-    quats = torch.randn((num_points, 4), device=device)
-    quats /= torch.linalg.norm(quats, dim=-1, keepdim=True)
-
-    # expects view matrix be column major
-    viewmat = torch.eye(4, device=device)
-    viewmat[3, 2] = 2
-    # projmat = torch.eye(4, device=device)
-    projmat = viewmat
-
-    fx = 3.0
-    fy = 3.0
-    img_height = 256
-    img_width = 256
-    tanfovx = 0.5 * img_width / fx
-    tanfovy = 0.5 * img_height / fy
-    bg = torch.ones(3, device=device)
-    scale_modifier = 1.0
-
-    settings = GaussianRasterizationSettings(
-        img_height,
-        img_width,
-        tanfovx,
-        tanfovy,
-        bg,
-        scale_modifier,
-        viewmat,
-        viewmat,
-        sh_degree=0,
-        campos=torch.zeros(3, device=device),
-        prefiltered=False,
-        debug=True,
-    )
-
-    out_color, radii = rasterize_gaussians(
-        means3d, means2d, sh, colors, opacities, scales, quats, covs3d, settings
-    )
-    img = (255 * out_color.detach().cpu().permute(1, 2, 0)).byte()
-    imageio.imwrite("test_reference_forward.png", img)
diff --git a/ref_rast/setup.py b/ref_rast/setup.py
deleted file mode 100644
index 0269cc2d0..000000000
--- a/ref_rast/setup.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import os
-
-from setuptools import setup
-from torch.utils.cpp_extension import BuildExtension, CUDAExtension
-
-CSRC = os.path.abspath(f"{__file__}/../../csrc")
-
-setup(
-    name="ref_rast",
-    packages=["ref_rast"],
-    description="reference package for differentiable rasterization of gaussians",
-    keywords="gaussian, splatting, cuda",
-    ext_modules=[
-        CUDAExtension(
-            name="ref_rast_cuda",
-            sources=[
-                f"{CSRC}/reference/cuda_rasterizer/rasterizer_impl.cu",
-                f"{CSRC}/reference/cuda_rasterizer/forward.cu",
-                f"{CSRC}/reference/cuda_rasterizer/backward.cu",
-                f"{CSRC}/reference/rasterize_points.cu",
-                f"{CSRC}/reference/ext.cpp",
-            ],
-            extra_compile_args={"nvcc": [f"-I {CSRC}/third_party/glm/"]},
-        ),
-    ],
-    cmdclass={"build_ext": BuildExtension},
-)
diff --git a/setup.py b/setup.py
new file mode 100644
index 000000000..1d3cf24bb
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,132 @@
+import glob
+import os
+import os.path as osp
+import platform
+import sys
+
+from setuptools import find_packages, setup
+
+__version__ = None
+exec(open("diff_rast/version.py", "r").read())
+
+URL = ""  # TODO
+
+BUILD_NO_CUDA = os.getenv("BUILD_NO_CUDA", "0") == "1"
+WITH_SYMBOLS = os.getenv("WITH_SYMBOLS", "0") == "1"
+
+
+def get_ext():
+    from torch.utils.cpp_extension import BuildExtension
+
+    return BuildExtension.with_options(no_python_abi_suffix=True, use_ninja=False)
+
+
+def get_extensions():
+    import torch
+    from torch.__config__ import parallel_info
+    from torch.utils.cpp_extension import CUDAExtension
+
+    extensions_dir = osp.join("diff_rast", "cuda", "csrc")
+    sources = glob.glob(osp.join(extensions_dir, "*.cu")) + glob.glob(
+        osp.join(extensions_dir, "*.cpp")
+    )
+    # sources = [
+    #     osp.join(extensions_dir, "ext.cpp"),
+    #     osp.join(extensions_dir, "rasterize.cu"),
+    #     osp.join(extensions_dir, "bindings.cu"),
+    #     osp.join(extensions_dir, "forward.cu"),
+    #     osp.join(extensions_dir, "backward.cu"),
+    # ]
+    # remove generated 'hip' files, in case of rebuilds
+    sources = [path for path in sources if "hip" not in path]
+
+    undef_macros = []
+    define_macros = []
+
+    if sys.platform == "win32":
+        define_macros += [("diff_rast_EXPORTS", None)]
+
+    extra_compile_args = {"cxx": ["-O3"]}
+    if not os.name == "nt":  # Not on Windows:
+        extra_compile_args["cxx"] += ["-Wno-sign-compare"]
+    extra_link_args = [] if WITH_SYMBOLS else ["-s"]
+
+    info = parallel_info()
+    if (
+        "backend: OpenMP" in info
+        and "OpenMP not found" not in info
+        and sys.platform != "darwin"
+    ):
+        extra_compile_args["cxx"] += ["-DAT_PARALLEL_OPENMP"]
+        if sys.platform == "win32":
+            extra_compile_args["cxx"] += ["/openmp"]
+        else:
+            extra_compile_args["cxx"] += ["-fopenmp"]
+    else:
+        print("Compiling without OpenMP...")
+
+    # Compile for mac arm64
+    if sys.platform == "darwin" and platform.machine() == "arm64":
+        extra_compile_args["cxx"] += ["-arch", "arm64"]
+        extra_link_args += ["-arch", "arm64"]
+
+    nvcc_flags = os.getenv("NVCC_FLAGS", "")
+    nvcc_flags = [] if nvcc_flags == "" else nvcc_flags.split(" ")
+    nvcc_flags += ["-O3"]
+    if torch.version.hip:
+        # USE_ROCM was added to later versions of PyTorch.
+        # Define here to support older PyTorch versions as well:
+        define_macros += [("USE_ROCM", None)]
+        undef_macros += ["__HIP_NO_HALF_CONVERSIONS__"]
+    else:
+        nvcc_flags += ["--expt-relaxed-constexpr"]
+    extra_compile_args["nvcc"] = nvcc_flags
+
+    extension = CUDAExtension(
+        f"diff_rast.csrc",
+        sources,
+        include_dirs=[osp.join(extensions_dir, "third_party", "glm")],
+        define_macros=define_macros,
+        undef_macros=undef_macros,
+        extra_compile_args=extra_compile_args,
+        extra_link_args=extra_link_args,
+    )
+
+    return [extension]
+
+
+setup(
+    name="diff_rast",
+    version=__version__,
+    description=" Python package for differentiable rasterization of gaussians",
+    keywords="gaussian, splatting, cuda",
+    url=URL,
+    download_url=f"{URL}/archive/{__version__}.tar.gz",
+    python_requires=">=3.7",
+    install_requires=[
+        "jaxtyping",
+        "rich>=12",
+        "torch",
+        "typing_extensions; python_version<'3.8'",
+    ],
+    extras_require={
+        # dev dependencies. Install them by `pip install diff_rast[dev]`
+        "dev": [
+            "black[jupyter]==22.3.0",
+            "isort==5.10.1",
+            "pylint==2.13.4",
+            "pytest==7.1.2",
+            "pytest-xdist==2.5.0",
+            "typeguard>=2.13.3",
+            "pyyaml==6.0",
+            "build",
+            "twine",
+            "ninja",
+        ],
+    },
+    ext_modules=get_extensions() if not BUILD_NO_CUDA else [],
+    cmdclass={"build_ext": get_ext()} if not BUILD_NO_CUDA else {},
+    packages=find_packages(),
+    # https://github.com/pypa/setuptools/issues/1461#issuecomment-954725244
+    include_package_data=True,
+)
diff --git a/tests/test_bindings_forward.py b/tests/test_bindings_forward.py
deleted file mode 100644
index d3372ab1a..000000000
--- a/tests/test_bindings_forward.py
+++ /dev/null
@@ -1,109 +0,0 @@
-"""Test against ref bindings
-
-Make sure you have the ref bindings installed:
-    - install ref bindings: cd ref_rast && pip install -e .
-"""
-
-
-import torch
-import imageio
-
-from diff_rast.rasterize import RasterizeGaussians
-from diff_rast.project_gaussians import ProjectGaussians
-
-from ref_rast import GaussianRasterizationSettings, rasterize_gaussians
-
-def test_bindings_forward(save_img=False):
-    means3d, sh, opacities, colors, covs3d, scales, quats, viewmat, projmat = _init_gaussians()
-
-    out_color = _run_ref(
-        means3d.clone(),
-        sh.clone(),
-        opacities.clone(),
-        colors.clone(),
-        covs3d.clone(),
-        scales.clone(),
-        quats.clone(),
-        viewmat.clone(),
-        projmat.clone(),
-    )
-    ref_img = (255 * out_color.detach().cpu()).byte()
-    imageio.imwrite("test_reference_forward.png", ref_img)
-
-    color = _run_diff_rast(means3d, colors, opacities, scales, quats, viewmat, projmat)
-    img = (255 * color.detach().cpu()).byte()
-    imageio.imwrite("test_diff_forward.png", img)
-
-    torch.testing.assert_close(color, out_color)
-
-
-def _run_ref(means3D, sh, opacities, colors, covs3d, scales, quats, viewmat, projmat):
-    settings = GaussianRasterizationSettings(
-        img_height,
-        img_width,
-        tanfovx,
-        tanfovy,
-        bg,
-        scale_modifier,
-        viewmat,
-        projmat,
-        sh_degree=0,
-        campos=viewmat[:3, 3],
-        prefiltered=False,
-        debug=True,
-    )
-
-    out_color, radii = rasterize_gaussians(
-        means3D.clone(), None, sh, colors.clone(), opacities.clone(), scales.clone(), quats.clone(), covs3d, settings
-    )
-
-    return out_color.permute(1, 2, 0)
-
-
-def _run_diff_rast(means, rgbs, opacities, scales, quats, viewmat, projmat):
-    xys, depths, radii, conics, num_tiles_hit = ProjectGaussians.apply(
-        means, scales, scale_modifier, quats, viewmat, projmat, fx, fy, img_height, img_width, TILE_BOUNDS
-    )
-    out_img = RasterizeGaussians.apply(xys, depths, radii, conics, num_tiles_hit, rgbs, opacities, img_height, img_width)
-
-    return out_img
-
-
-def _init_gaussians():
-    means3d = torch.randn((num_points, 3), device=device)
-    sh = torch.Tensor([]).to(device)
-    opacities = torch.ones((num_points, 1), device=device) * 0.9
-    colors = torch.rand((num_points, 3), device=device)
-    covs3d = torch.Tensor([]).to(device)
-    scales = torch.rand((num_points, 3), device=device)
-    quats = torch.randn((num_points, 4), device=device)
-    quats /= torch.linalg.norm(quats, dim=-1, keepdim=True)
-
-    viewmat = torch.eye(4, device=device)
-
-    projmat = viewmat
-
-    means3d.requires_grad = True
-    scales.requires_grad = True
-    quats.requires_grad = True
-    colors.requires_grad = True
-    opacities.requires_grad = True
-
-    return means3d, sh, opacities, colors, covs3d, scales, quats, viewmat, projmat
-
-
-if __name__ == "__main__":
-    device = torch.device("cuda:0")
-    num_points = 100
-
-    fx = 3.0
-    fy = 3.0
-    img_height = 256*2
-    img_width = 256
-    tanfovx = 0.5 * img_width / fx
-    tanfovy = 0.5 * img_height / fy
-    bg = torch.ones(3, device=device)
-    scale_modifier = 1.0
-    TILE_BOUNDS = (img_width + 16 - 1) // 16, (img_height + 16 - 1) // 16, 1
-
-    test_bindings_forward(save_img=True)
diff --git a/tests/test_proj.py b/tests/test_proj.py
deleted file mode 100644
index 683eab68f..000000000
--- a/tests/test_proj.py
+++ /dev/null
@@ -1,85 +0,0 @@
-import pytest
-import torch
-
-device = torch.device("cuda:0")
-
-
-@pytest.mark.skipif(not torch.cuda.is_available, reason="No CUDA device")
-def test_project_gaussians_forward():
-    from diff_rast import _torch_impl
-    import cuda_lib
-
-    torch.manual_seed(42)
-
-    num_points = 1_000_000
-    means3d = torch.randn((num_points, 3), device=device, requires_grad=True)
-    scales = torch.randn((num_points, 3), device=device)
-    glob_scale = 0.3
-    quats = torch.randn((num_points, 4), device=device)
-    quats /= torch.linalg.norm(quats, dim=-1, keepdim=True)
-    viewmat = torch.eye(4, device=device)
-    projmat = torch.eye(4, device=device)
-    fx, fy = 3.0, 3.0
-    H, W = 512, 512
-
-    BLOCK_X, BLOCK_Y = 16, 16
-    tile_bounds = (W + BLOCK_X - 1) // BLOCK_X, (H + BLOCK_Y - 1) // BLOCK_Y, 1
-
-    (
-        cov3d,
-        xys,
-        depths,
-        radii,
-        conics,
-        num_tiles_hit,
-    ) = cuda_lib.project_gaussians_forward(
-        num_points,
-        means3d,
-        scales,
-        glob_scale,
-        quats,
-        viewmat,
-        projmat,
-        fx,
-        fy,
-        (H, W),  # TODO(ruilong): Is this HW or WH?
-        tile_bounds,
-    )
-
-    (
-        _cov3d,
-        _xys,
-        _depths,
-        _radii,
-        _conics,
-        _num_tiles_hit,
-        _masks,
-    ) = _torch_impl.project_gaussians_forward(
-        means3d,
-        scales,
-        glob_scale,
-        quats,
-        viewmat,
-        projmat,
-        fx,
-        fy,
-        (H, W),
-        tile_bounds,
-    )
-
-    torch.testing.assert_close(
-        cov3d[_masks],
-        _cov3d.view(-1, 9)[_masks][:, [0, 1, 2, 4, 5, 8]],
-        atol=1e-5,
-        rtol=1e-5,
-    )
-    torch.testing.assert_close(xys[_masks], _xys[_masks])
-    torch.testing.assert_close(depths[_masks], _depths[_masks])
-    torch.testing.assert_close(radii[_masks], _radii[_masks])
-    torch.testing.assert_close(conics[_masks], _conics[_masks])
-    torch.testing.assert_close(num_tiles_hit[_masks], _num_tiles_hit[_masks])
-
-
-
-if __name__ == "__main__":
-    test_project_gaussians_forward()