From 6b5941848eb600d348b0f6749a2923dd8abc2c40 Mon Sep 17 00:00:00 2001
From: Mehdi Ataei <ataei8@gmail.com>
Date: Tue, 19 Mar 2024 16:12:38 -0400
Subject: [PATCH 1/4] Update README.md

---
 README.md | 43 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6bb7727..91f5cdc 100644
--- a/README.md
+++ b/README.md
@@ -153,4 +153,45 @@ git clone https://github.com/Autodesk/XLB
 cd XLB
 export PYTHONPATH=.
 python3 examples/CFD/cavity2d.py
-```
\ No newline at end of file
+```
+## Roadmap
+
+### Work in Progress (WIP)
+*Note: Some of the work-in-progress features can be found in the branches of the XLB repository. For contributions to these features, please reach out.*
+
+🚀 **Warp Backend:** Achieving state-of-the-art performance by leveraging the [Warp](https://github.com/NVIDIA/warp) framework in combination with JAX.
+
+🌐 **Grid Refinement:** Implementing adaptive mesh refinement techniques for enhanced simulation accuracy.
+
+⚡ **Multi-GPU Acceleration using [Neon](https://github.com/Autodesk/Neon) + Warp:** Using Neon's data structure for better overlapping computation and communication.
+
+💾 **Out-of-Core Computations:** Enabling simulations that exceed available GPU memory, suitable for CPU+GPU coherent memory models such as NVIDIA's Grace Superchips.
+
+🗜️ **GPU Accelerated Lossless Compression and Decompression**: Implementing high-performance lossless compression and decompression techniques for larger-scale simulations and improved performance.
+
+🌡️ **Fluid-Thermal Simulation Capabilities:** Incorporating heat transfer and thermal effects into fluid simulations.
+
+🎯 **Adjoint-based Shape and Topology Optimization:** Implementing gradient-based optimization techniques for design optimization.
+
+🧠 **Machine Learning Accelerated Simulations:** Leveraging machine learning to speed up simulations and improve accuracy.
+
+📉 **Reduced Order Modeling using Machine Learning:** Developing data-driven reduced-order models for efficient simulations.
+
+
+### Wishlist
+*Contributions to these features are welcome. Please submit PRs for the Wishlist items.*
+
+🌊 **Free Surface Flows:** Simulating flows with free surfaces, such as water waves and droplets.
+
+📡 **Electromagnetic Wave Propagation:** Simulating the propagation of electromagnetic waves.
+
+🛩️ **Supersonic Flows:** Simulating supersonic flows based on the techniques described in [this paper](https://royalsocietypublishing.org/doi/10.1098/rsta.2019.0559).
+
+🤝 **Fluid-Solid Interaction (Using Immersed Boundary Method):** Modeling the interaction between fluids and solid objects.
+
+🧩 **Multiphase Flow Simulation:** Simulating flows with multiple immiscible fluids.
+
+🪨 **Particle Flows and Discrete Element Method:** Incorporating particle-based methods for granular and particulate flows.
+
+🔧 **Better Geometry Processing Pipelines:** Improving the handling and preprocessing of complex geometries for simulations.
+

From 574dd7ff414a9ceb389ef5177ddc999226604b89 Mon Sep 17 00:00:00 2001
From: Mehdi Ataei <ataei8@gmail.com>
Date: Tue, 19 Mar 2024 16:20:42 -0400
Subject: [PATCH 2/4] Update README.md

---
 README.md | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 91f5cdc..aad6134 100644
--- a/README.md
+++ b/README.md
@@ -159,39 +159,41 @@ python3 examples/CFD/cavity2d.py
 ### Work in Progress (WIP)
 *Note: Some of the work-in-progress features can be found in the branches of the XLB repository. For contributions to these features, please reach out.*
 
-🚀 **Warp Backend:** Achieving state-of-the-art performance by leveraging the [Warp](https://github.com/NVIDIA/warp) framework in combination with JAX.
+- 🚀 **Warp Backend:** Achieving state-of-the-art performance by leveraging the [Warp](https://github.com/NVIDIA/warp) framework in combination with JAX.
 
-🌐 **Grid Refinement:** Implementing adaptive mesh refinement techniques for enhanced simulation accuracy.
+ - 🌐 **Grid Refinement:** Implementing adaptive mesh refinement techniques for enhanced simulation accuracy.
 
-⚡ **Multi-GPU Acceleration using [Neon](https://github.com/Autodesk/Neon) + Warp:** Using Neon's data structure for better overlapping computation and communication.
+- ⚡ **Multi-GPU Acceleration using [Neon](https://github.com/Autodesk/Neon) + Warp:** Using Neon's data structure for improved scaling.
 
-💾 **Out-of-Core Computations:** Enabling simulations that exceed available GPU memory, suitable for CPU+GPU coherent memory models such as NVIDIA's Grace Superchips.
+- 💾 **Out-of-Core Computations:** Enabling simulations that exceed available GPU memory, suitable for CPU+GPU coherent memory models such as NVIDIA's Grace Superchips.
 
-🗜️ **GPU Accelerated Lossless Compression and Decompression**: Implementing high-performance lossless compression and decompression techniques for larger-scale simulations and improved performance.
+- 🗜️ **GPU Accelerated Lossless Compression and Decompression**: Implementing high-performance lossless compression and decompression techniques for larger-scale simulations and improved performance.
 
-🌡️ **Fluid-Thermal Simulation Capabilities:** Incorporating heat transfer and thermal effects into fluid simulations.
+- 🌡️ **Fluid-Thermal Simulation Capabilities:** Incorporating heat transfer and thermal effects into fluid simulations.
 
-🎯 **Adjoint-based Shape and Topology Optimization:** Implementing gradient-based optimization techniques for design optimization.
+- 🎯 **Adjoint-based Shape and Topology Optimization:** Implementing gradient-based optimization techniques for design optimization.
 
-🧠 **Machine Learning Accelerated Simulations:** Leveraging machine learning to speed up simulations and improve accuracy.
+- 🧠 **Machine Learning Accelerated Simulations:** Leveraging machine learning to speed up simulations and improve accuracy.
 
-📉 **Reduced Order Modeling using Machine Learning:** Developing data-driven reduced-order models for efficient simulations.
+- 📉 **Reduced Order Modeling using Machine Learning:** Developing data-driven reduced-order models for efficient and accurate simulations.
 
 
 ### Wishlist
 *Contributions to these features are welcome. Please submit PRs for the Wishlist items.*
 
-🌊 **Free Surface Flows:** Simulating flows with free surfaces, such as water waves and droplets.
+- 🌊 **Free Surface Flows:** Simulating flows with free surfaces, such as water waves and droplets.
 
-📡 **Electromagnetic Wave Propagation:** Simulating the propagation of electromagnetic waves.
+- 📡 **Electromagnetic Wave Propagation:** Simulating the propagation of electromagnetic waves.
 
-🛩️ **Supersonic Flows:** Simulating supersonic flows based on the techniques described in [this paper](https://royalsocietypublishing.org/doi/10.1098/rsta.2019.0559).
+- 🛩️ **Supersonic Flows:** Simulating supersonic flows.
 
-🤝 **Fluid-Solid Interaction (Using Immersed Boundary Method):** Modeling the interaction between fluids and solid objects.
+- 🌊🧱 **Fluid-Solid Interaction:** Modeling the interaction between fluids and solid objects.
 
-🧩 **Multiphase Flow Simulation:** Simulating flows with multiple immiscible fluids.
+- 🧩 **Multiphase Flow Simulation:** Simulating flows with multiple immiscible fluids.
 
-🪨 **Particle Flows and Discrete Element Method:** Incorporating particle-based methods for granular and particulate flows.
+- 🔥 **Combustion:** Simulating combustion processes and reactive flows.
 
-🔧 **Better Geometry Processing Pipelines:** Improving the handling and preprocessing of complex geometries for simulations.
+- 🪨 **Particle Flows and Discrete Element Method:** Incorporating particle-based methods for granular and particulate flows.
+
+- 🔧 **Better Geometry Processing Pipelines:** Improving the handling and preprocessing of complex geometries for simulations.
 

From e351e49a07b3695544985c75d90fd690e9999a29 Mon Sep 17 00:00:00 2001
From: Mehdi Ataei <ataei8@gmail.com>
Date: Tue, 4 Jun 2024 08:48:21 +0100
Subject: [PATCH 3/4] Updated paper link/citation to CPC

---
 README.md | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index aad6134..dcb67ee 100644
--- a/README.md
+++ b/README.md
@@ -10,18 +10,21 @@ XLB is a fully differentiable 2D/3D Lattice Boltzmann Method (LBM) library that
 
 ## Accompanying Paper
 
-Please refer to the [accompanying paper](https://arxiv.org/abs/2311.16080) for benchmarks, validation, and more details about the library.
+Please refer to the [accompanying paper](https://doi.org/10.1016/j.cpc.2024.109187) for benchmarks, validation, and more details about the library.
 
 ## Citing XLB
 
 If you use XLB in your research, please cite the following paper:
 
 ```
-@article{ataei2023xlb,
-      title={{XLB}: A Differentiable Massively Parallel Lattice Boltzmann Library in Python}, 
-      author={Ataei, Mohammadmehdi and Salehipour, Hesam},
-      journal={arXiv preprint arXiv:2311.16080},
-      year={2023},
+@article{ataei2024xlb,
+  title={{XLB}: A differentiable massively parallel lattice {Boltzmann} library in {Python}},
+  author={Ataei, Mohammadmehdi and Salehipour, Hesam},
+  journal={Computer Physics Communications},
+  volume={300},
+  pages={109187},
+  year={2024},
+  publisher={Elsevier}
 }
 ```
 

From 9496f6fb80bbccab2541cd5092889773908df23c Mon Sep 17 00:00:00 2001
From: Mehdi Ataei <mehdi.ataei@autodesk.com>
Date: Tue, 24 Sep 2024 09:38:02 -0400
Subject: [PATCH 4/4] Fixed mixed-precision for Warp

---
 examples/cfd/flow_past_sphere_3d.py           |  2 +-
 examples/cfd/lid_driven_cavity_2d.py          |  2 +-
 examples/cfd/turbulent_channel_3d.py          |  2 +-
 examples/cfd/windtunnel_3d.py                 |  2 +-
 examples/performance/mlups_3d.py              |  2 +-
 xlb/helper/initializers.py                    |  6 +-
 .../boundary_condition/bc_do_nothing.py       |  4 +-
 .../boundary_condition/bc_equilibrium.py      |  4 +-
 .../bc_extrapolation_outflow.py               |  4 +-
 .../bc_fullway_bounce_back.py                 |  4 +-
 .../bc_halfway_bounce_back.py                 |  4 +-
 .../boundary_condition/bc_regularized.py      |  4 +-
 xlb/operator/boundary_condition/bc_zouhe.py   |  4 +-
 .../boundary_condition/boundary_condition.py  |  8 +--
 xlb/operator/collision/bgk.py                 |  4 +-
 xlb/operator/collision/kbc.py                 |  4 +-
 .../equilibrium/quadratic_equilibrium.py      |  4 +-
 xlb/operator/force/exact_difference_force.py  |  4 +-
 xlb/operator/macroscopic/first_moment.py      |  4 +-
 xlb/operator/macroscopic/macroscopic.py       |  8 +--
 xlb/operator/macroscopic/second_moment.py     |  4 +-
 xlb/operator/stepper/nse_stepper.py           | 57 +++++++++----------
 xlb/operator/stream/stream.py                 |  9 +--
 xlb/precision_policy.py                       | 11 +---
 24 files changed, 74 insertions(+), 87 deletions(-)

diff --git a/examples/cfd/flow_past_sphere_3d.py b/examples/cfd/flow_past_sphere_3d.py
index 8c99dff..2487919 100644
--- a/examples/cfd/flow_past_sphere_3d.py
+++ b/examples/cfd/flow_past_sphere_3d.py
@@ -94,7 +94,7 @@ def setup_boundary_masker(self):
         self.bc_mask, self.missing_mask = indices_boundary_masker(self.boundary_conditions, self.bc_mask, self.missing_mask, (0, 0, 0))
 
     def initialize_fields(self):
-        self.f_0 = initialize_eq(self.f_0, self.grid, self.velocity_set, self.backend)
+        self.f_0 = initialize_eq(self.f_0, self.grid, self.velocity_set, self.precision_policy, self.backend)
 
     def setup_stepper(self, omega):
         self.stepper = IncompressibleNavierStokesStepper(omega, boundary_conditions=self.boundary_conditions, collision_type="BGK")
diff --git a/examples/cfd/lid_driven_cavity_2d.py b/examples/cfd/lid_driven_cavity_2d.py
index c67ce8e..f94e209 100644
--- a/examples/cfd/lid_driven_cavity_2d.py
+++ b/examples/cfd/lid_driven_cavity_2d.py
@@ -61,7 +61,7 @@ def setup_boundary_masker(self):
         self.bc_mask, self.missing_mask = indices_boundary_masker(self.boundary_conditions, self.bc_mask, self.missing_mask)
 
     def initialize_fields(self):
-        self.f_0 = initialize_eq(self.f_0, self.grid, self.velocity_set, self.backend)
+        self.f_0 = initialize_eq(self.f_0, self.grid, self.velocity_set, self.precision_policy, self.backend)
 
     def setup_stepper(self, omega):
         self.stepper = IncompressibleNavierStokesStepper(omega, boundary_conditions=self.boundary_conditions)
diff --git a/examples/cfd/turbulent_channel_3d.py b/examples/cfd/turbulent_channel_3d.py
index ea8cad7..65b56bf 100644
--- a/examples/cfd/turbulent_channel_3d.py
+++ b/examples/cfd/turbulent_channel_3d.py
@@ -102,7 +102,7 @@ def initialize_fields(self):
             u_init = jnp.full(shape=shape, fill_value=1e-2 * u_init)
         else:
             u_init = wp.array(1e-2 * u_init, dtype=self.precision_policy.compute_precision.wp_dtype)
-        self.f_0 = initialize_eq(self.f_0, self.grid, self.velocity_set, self.backend, u=u_init)
+        self.f_0 = initialize_eq(self.f_0, self.grid, self.velocity_set, self.precision_policy, self.backend, u=u_init)
 
     def setup_stepper(self):
         force = self.get_force()
diff --git a/examples/cfd/windtunnel_3d.py b/examples/cfd/windtunnel_3d.py
index 92c94aa..077ae98 100644
--- a/examples/cfd/windtunnel_3d.py
+++ b/examples/cfd/windtunnel_3d.py
@@ -122,7 +122,7 @@ def setup_boundary_masker(self):
         self.bc_mask, self.missing_mask = mesh_boundary_masker(bc_mesh, origin, spacing, self.bc_mask, self.missing_mask)
 
     def initialize_fields(self):
-        self.f_0 = initialize_eq(self.f_0, self.grid, self.velocity_set, self.backend)
+        self.f_0 = initialize_eq(self.f_0, self.grid, self.velocity_set, self.precision_policy, self.backend)
 
     def setup_stepper(self):
         self.stepper = IncompressibleNavierStokesStepper(self.omega, boundary_conditions=self.boundary_conditions, collision_type="KBC")
diff --git a/examples/performance/mlups_3d.py b/examples/performance/mlups_3d.py
index 32c3d6d..907c1f2 100644
--- a/examples/performance/mlups_3d.py
+++ b/examples/performance/mlups_3d.py
@@ -100,7 +100,7 @@ def main():
     backend, precision_policy = setup_simulation(args)
     velocity_set = xlb.velocity_set.D3Q19(precision_policy=precision_policy, backend=backend)
     grid, f_0, f_1, missing_mask, bc_mask = create_grid_and_fields(args.cube_edge)
-    f_0 = initialize_eq(f_0, grid, velocity_set, backend)
+    f_0 = initialize_eq(f_0, grid, velocity_set, precision_policy, backend)
 
     elapsed_time = run(f_0, f_1, backend, precision_policy, grid, bc_mask, missing_mask, args.num_steps)
     mlups = calculate_mlups(args.cube_edge, args.num_steps, elapsed_time)
diff --git a/xlb/helper/initializers.py b/xlb/helper/initializers.py
index c8439d9..ccb4a82 100644
--- a/xlb/helper/initializers.py
+++ b/xlb/helper/initializers.py
@@ -2,11 +2,11 @@
 from xlb.operator.equilibrium import QuadraticEquilibrium
 
 
-def initialize_eq(f, grid, velocity_set, backend, rho=None, u=None):
+def initialize_eq(f, grid, velocity_set, precision_policy, backend, rho=None, u=None):
     if rho is None:
-        rho = grid.create_field(cardinality=1, fill_value=1.0)
+        rho = grid.create_field(cardinality=1, fill_value=1.0, dtype=precision_policy.compute_precision)
     if u is None:
-        u = grid.create_field(cardinality=velocity_set.d, fill_value=0.0)
+        u = grid.create_field(cardinality=velocity_set.d, fill_value=0.0, dtype=precision_policy.compute_precision)
     equilibrium = QuadraticEquilibrium()
 
     if backend == ComputeBackend.JAX:
diff --git a/xlb/operator/boundary_condition/bc_do_nothing.py b/xlb/operator/boundary_condition/bc_do_nothing.py
index dcdc8fd..0ddbcfc 100644
--- a/xlb/operator/boundary_condition/bc_do_nothing.py
+++ b/xlb/operator/boundary_condition/bc_do_nothing.py
@@ -86,7 +86,7 @@ def kernel2d(
 
             # Write the result
             for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = _f[l]
+                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
 
         # Construct the warp kernel
         @wp.kernel
@@ -112,7 +112,7 @@ def kernel3d(
 
             # Write the result
             for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = _f[l]
+                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
 
         kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
 
diff --git a/xlb/operator/boundary_condition/bc_equilibrium.py b/xlb/operator/boundary_condition/bc_equilibrium.py
index 716fd8e..6d4e3ed 100644
--- a/xlb/operator/boundary_condition/bc_equilibrium.py
+++ b/xlb/operator/boundary_condition/bc_equilibrium.py
@@ -111,7 +111,7 @@ def kernel2d(
 
             # Write the result
             for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = _f[l]
+                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
 
         # Construct the warp kernel
         @wp.kernel
@@ -137,7 +137,7 @@ def kernel3d(
 
             # Write the result
             for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = _f[l]
+                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
 
         kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
 
diff --git a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
index 87c6850..55a5851 100644
--- a/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
+++ b/xlb/operator/boundary_condition/bc_extrapolation_outflow.py
@@ -229,7 +229,7 @@ def kernel2d(
 
             # Write the distribution function
             for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = _f[l]
+                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
 
         # Construct the warp kernel
         @wp.kernel
@@ -270,7 +270,7 @@ def kernel3d(
 
             # Write the distribution function
             for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = _f[l]
+                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
 
         kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
 
diff --git a/xlb/operator/boundary_condition/bc_fullway_bounce_back.py b/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
index 85bffb1..57d29fd 100644
--- a/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_fullway_bounce_back.py
@@ -95,7 +95,7 @@ def kernel2d(
 
             # Write the result to the output
             for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = _f[l]
+                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
 
         # Construct the warp kernel
         @wp.kernel
@@ -121,7 +121,7 @@ def kernel3d(
 
             # Write the result to the output
             for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = _f[l]
+                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
 
         kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
 
diff --git a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
index c2483c9..e723570 100644
--- a/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
+++ b/xlb/operator/boundary_condition/bc_halfway_bounce_back.py
@@ -110,7 +110,7 @@ def kernel2d(
 
             # Write the distribution function
             for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = _f[l]
+                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
 
         # Construct the warp kernel
         @wp.kernel
@@ -136,7 +136,7 @@ def kernel3d(
 
             # Write the distribution function
             for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = _f[l]
+                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
 
         kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
 
diff --git a/xlb/operator/boundary_condition/bc_regularized.py b/xlb/operator/boundary_condition/bc_regularized.py
index 7879137..a42b695 100644
--- a/xlb/operator/boundary_condition/bc_regularized.py
+++ b/xlb/operator/boundary_condition/bc_regularized.py
@@ -344,7 +344,7 @@ def kernel2d(
 
             # Write the distribution function
             for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = _f[l]
+                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
 
         # Construct the warp kernel
         @wp.kernel
@@ -370,7 +370,7 @@ def kernel3d(
 
             # Write the distribution function
             for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = _f[l]
+                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
 
         kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
         if self.velocity_set.d == 3 and self.bc_type == "velocity":
diff --git a/xlb/operator/boundary_condition/bc_zouhe.py b/xlb/operator/boundary_condition/bc_zouhe.py
index 0bf68b8..4e9fe29 100644
--- a/xlb/operator/boundary_condition/bc_zouhe.py
+++ b/xlb/operator/boundary_condition/bc_zouhe.py
@@ -352,7 +352,7 @@ def kernel2d(
 
             # Write the distribution function
             for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1]] = _f[l]
+                f_post[l, index[0], index[1]] = self.store_dtype(_f[l])
 
         # Construct the warp kernel
         @wp.kernel
@@ -378,7 +378,7 @@ def kernel3d(
 
             # Write the distribution function
             for l in range(self.velocity_set.q):
-                f_post[l, index[0], index[1], index[2]] = _f[l]
+                f_post[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
 
         kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
         if self.velocity_set.d == 3 and self.bc_type == "velocity":
diff --git a/xlb/operator/boundary_condition/boundary_condition.py b/xlb/operator/boundary_condition/boundary_condition.py
index 8d47929..9f6ef5d 100644
--- a/xlb/operator/boundary_condition/boundary_condition.py
+++ b/xlb/operator/boundary_condition/boundary_condition.py
@@ -81,8 +81,8 @@ def _get_thread_data_2d(
             _missing_mask = _missing_mask_vec()
             for l in range(self.velocity_set.q):
                 # q-sized vector of populations
-                _f_pre[l] = f_pre[l, index[0], index[1]]
-                _f_post[l] = f_post[l, index[0], index[1]]
+                _f_pre[l] = self.compute_dtype(f_pre[l, index[0], index[1]])
+                _f_post[l] = self.compute_dtype(f_post[l, index[0], index[1]])
 
                 # TODO fix vec bool
                 if missing_mask[l, index[0], index[1]]:
@@ -106,8 +106,8 @@ def _get_thread_data_3d(
             _missing_mask = _missing_mask_vec()
             for l in range(self.velocity_set.q):
                 # q-sized vector of populations
-                _f_pre[l] = f_pre[l, index[0], index[1], index[2]]
-                _f_post[l] = f_post[l, index[0], index[1], index[2]]
+                _f_pre[l] = self.compute_dtype(f_pre[l, index[0], index[1], index[2]])
+                _f_post[l] = self.compute_dtype(f_post[l, index[0], index[1], index[2]])
 
                 # TODO fix vec bool
                 if missing_mask[l, index[0], index[1], index[2]]:
diff --git a/xlb/operator/collision/bgk.py b/xlb/operator/collision/bgk.py
index 196e3ba..60f63ef 100644
--- a/xlb/operator/collision/bgk.py
+++ b/xlb/operator/collision/bgk.py
@@ -59,7 +59,7 @@ def kernel2d(
 
             # Write the result
             for l in range(self.velocity_set.q):
-                fout[l, index[0], index[1]] = _fout[l]
+                fout[l, index[0], index[1]] = self.store_dtype(_fout[l])
 
         # Construct the warp kernel
         @wp.kernel
@@ -86,7 +86,7 @@ def kernel3d(
 
             # Write the result
             for l in range(self.velocity_set.q):
-                fout[l, index[0], index[1], index[2]] = _fout[l]
+                fout[l, index[0], index[1], index[2]] = self.store_dtype(_fout[l])
 
         kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
 
diff --git a/xlb/operator/collision/kbc.py b/xlb/operator/collision/kbc.py
index 7984829..bc731c6 100644
--- a/xlb/operator/collision/kbc.py
+++ b/xlb/operator/collision/kbc.py
@@ -337,7 +337,7 @@ def kernel2d(
 
             # Write the result
             for l in range(self.velocity_set.q):
-                fout[l, index[0], index[1]] = _fout[l]
+                fout[l, index[0], index[1]] = self.store_dtype(_fout[l])
 
         # Construct the warp kernel
         @wp.kernel
@@ -369,7 +369,7 @@ def kernel3d(
 
             # Write the result
             for l in range(self.velocity_set.q):
-                fout[l, index[0], index[1], index[2]] = _fout[l]
+                fout[l, index[0], index[1], index[2]] = self.store_dtype(_fout[l])
 
         functional = functional3d if self.velocity_set.d == 3 else functional2d
         kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
diff --git a/xlb/operator/equilibrium/quadratic_equilibrium.py b/xlb/operator/equilibrium/quadratic_equilibrium.py
index 0cce91b..ba337f0 100644
--- a/xlb/operator/equilibrium/quadratic_equilibrium.py
+++ b/xlb/operator/equilibrium/quadratic_equilibrium.py
@@ -79,7 +79,7 @@ def kernel3d(
 
             # Set the output
             for l in range(self.velocity_set.q):
-                f[l, index[0], index[1], index[2]] = feq[l]
+                f[l, index[0], index[1], index[2]] = self.store_dtype(feq[l])
 
         @wp.kernel
         def kernel2d(
@@ -100,7 +100,7 @@ def kernel2d(
 
             # Set the output
             for l in range(self.velocity_set.q):
-                f[l, index[0], index[1]] = feq[l]
+                f[l, index[0], index[1]] = self.store_dtype(feq[l])
 
         kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
 
diff --git a/xlb/operator/force/exact_difference_force.py b/xlb/operator/force/exact_difference_force.py
index f148e12..b4da602 100644
--- a/xlb/operator/force/exact_difference_force.py
+++ b/xlb/operator/force/exact_difference_force.py
@@ -108,7 +108,7 @@ def kernel2d(
 
             # Write the result
             for l in range(self.velocity_set.q):
-                fout[l, index[0], index[1]] = _fout[l]
+                fout[l, index[0], index[1]] = self.store_dtype(_fout[l])
 
         # Construct the warp kernel
         @wp.kernel
@@ -134,7 +134,7 @@ def kernel3d(
 
             # Write the result
             for l in range(self.velocity_set.q):
-                fout[l, index[0], index[1], index[2]] = _fout[l]
+                fout[l, index[0], index[1], index[2]] = self.store_dtype(_fout[l])
 
         kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
         return functional, kernel
diff --git a/xlb/operator/macroscopic/first_moment.py b/xlb/operator/macroscopic/first_moment.py
index 561fe7a..329a71f 100644
--- a/xlb/operator/macroscopic/first_moment.py
+++ b/xlb/operator/macroscopic/first_moment.py
@@ -53,7 +53,7 @@ def kernel3d(
             _u = functional(_f, _rho)
 
             for d in range(self.velocity_set.d):
-                u[d, index[0], index[1], index[2]] = _u[d]
+                u[d, index[0], index[1], index[2]] = self.store_dtype(_u[d])
 
         @wp.kernel
         def kernel2d(
@@ -71,7 +71,7 @@ def kernel2d(
             _u = functional(_f, _rho)
 
             for d in range(self.velocity_set.d):
-                u[d, index[0], index[1]] = _u[d]
+                u[d, index[0], index[1]] = self.store_dtype(_u[d])
 
         kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
 
diff --git a/xlb/operator/macroscopic/macroscopic.py b/xlb/operator/macroscopic/macroscopic.py
index 495a6a0..b574436 100644
--- a/xlb/operator/macroscopic/macroscopic.py
+++ b/xlb/operator/macroscopic/macroscopic.py
@@ -50,9 +50,9 @@ def kernel3d(
                 _f[l] = f[l, index[0], index[1], index[2]]
             _rho, _u = functional(_f)
 
-            rho[0, index[0], index[1], index[2]] = _rho
+            rho[0, index[0], index[1], index[2]] = self.store_dtype(_rho)
             for d in range(self.velocity_set.d):
-                u[d, index[0], index[1], index[2]] = _u[d]
+                u[d, index[0], index[1], index[2]] = self.store_dtype(_u[d])
 
         @wp.kernel
         def kernel2d(
@@ -68,9 +68,9 @@ def kernel2d(
                 _f[l] = f[l, index[0], index[1]]
             _rho, _u = functional(_f)
 
-            rho[0, index[0], index[1]] = _rho
+            rho[0, index[0], index[1]] = self.store_dtype(_rho)
             for d in range(self.velocity_set.d):
-                u[d, index[0], index[1]] = _u[d]
+                u[d, index[0], index[1]] = self.store_dtype(_u[d])
 
         kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
 
diff --git a/xlb/operator/macroscopic/second_moment.py b/xlb/operator/macroscopic/second_moment.py
index 917c86a..687b38a 100644
--- a/xlb/operator/macroscopic/second_moment.py
+++ b/xlb/operator/macroscopic/second_moment.py
@@ -95,7 +95,7 @@ def kernel3d(
 
             # Set the output
             for d in range(_pi_dim):
-                pi[d, index[0], index[1], index[2]] = _pi[d]
+                pi[d, index[0], index[1], index[2]] = self.store_dtype(_pi[d])
 
         @wp.kernel
         def kernel2d(
@@ -114,7 +114,7 @@ def kernel2d(
 
             # Set the output
             for d in range(_pi_dim):
-                pi[d, index[0], index[1]] = _pi[d]
+                pi[d, index[0], index[1]] = self.store_dtype(_pi[d])
 
         kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
 
diff --git a/xlb/operator/stepper/nse_stepper.py b/xlb/operator/stepper/nse_stepper.py
index 18b2167..05cee7b 100644
--- a/xlb/operator/stepper/nse_stepper.py
+++ b/xlb/operator/stepper/nse_stepper.py
@@ -190,18 +190,18 @@ def get_thread_data_2d(
             index: Any,
         ):
             # Get the boundary id and missing mask
-            f_post_collision = _f_vec()
+            _f_post_collision = _f_vec()
             _missing_mask = _missing_mask_vec()
             for l in range(self.velocity_set.q):
                 # q-sized vector of pre-streaming populations
-                f_post_collision[l] = f_0[l, index[0], index[1]]
+                _f_post_collision[l] = self.compute_dtype(f_0[l, index[0], index[1]])
 
                 # TODO fix vec bool
                 if missing_mask[l, index[0], index[1]]:
                     _missing_mask[l] = wp.uint8(1)
                 else:
                     _missing_mask[l] = wp.uint8(0)
-            return f_post_collision, _missing_mask
+            return _f_post_collision, _missing_mask
 
         @wp.func
         def get_thread_data_3d(
@@ -210,18 +210,18 @@ def get_thread_data_3d(
             index: Any,
         ):
             # Get the boundary id and missing mask
-            f_post_collision = _f_vec()
+            _f_post_collision = _f_vec()
             _missing_mask = _missing_mask_vec()
             for l in range(self.velocity_set.q):
                 # q-sized vector of pre-streaming populations
-                f_post_collision[l] = f_0[l, index[0], index[1], index[2]]
+                _f_post_collision[l] = self.compute_dtype(f_0[l, index[0], index[1], index[2]])
 
                 # TODO fix vec bool
                 if missing_mask[l, index[0], index[1], index[2]]:
                     _missing_mask[l] = wp.uint8(1)
                 else:
                     _missing_mask[l] = wp.uint8(0)
-            return f_post_collision, _missing_mask
+            return _f_post_collision, _missing_mask
 
         @wp.func
         def get_bc_auxilary_data_2d(
@@ -243,7 +243,7 @@ def get_bc_auxilary_data_2d(
                         for d in range(self.velocity_set.d):
                             pull_index[d] = index[d] - (_c[d, l] + nv[d])
                         # The following is the post-streaming values of the neighbor cell
-                        f_auxiliary[l] = f_0[l, pull_index[0], pull_index[1]]
+                        f_auxiliary[l] = self.compute_dtype(f_0[l, pull_index[0], pull_index[1]])
             return f_auxiliary
 
         @wp.func
@@ -266,7 +266,7 @@ def get_bc_auxilary_data_3d(
                         for d in range(self.velocity_set.d):
                             pull_index[d] = index[d] - (_c[d, l] + nv[d])
                         # The following is the post-streaming values of the neighbor cell
-                        f_auxiliary[l] = f_0[l, pull_index[0], pull_index[1], pull_index[2]]
+                        f_auxiliary[l] = self.compute_dtype(f_0[l, pull_index[0], pull_index[1], pull_index[2]])
             return f_auxiliary
 
         @wp.kernel
@@ -283,38 +283,33 @@ def kernel2d(
             index = wp.vec2i(i, j)  # TODO warp should fix this
 
             # Read thread data for populations and missing mask
-            f_post_collision, _missing_mask = get_thread_data_2d(f_0, missing_mask, index)
+            _f_post_collision, _missing_mask = get_thread_data_2d(f_0, missing_mask, index)
 
             # Apply streaming (pull method)
-            f_post_stream = self.stream.warp_functional(f_0, index)
+            _f_post_stream = self.stream.warp_functional(f_0, index)
 
             # Prepare auxilary data for BC (if applicable)
             _boundary_id = bc_mask[0, index[0], index[1]]
-            f_auxiliary = get_bc_auxilary_data_2d(f_0, index, _boundary_id, _missing_mask, bc_struct)
+            _f_auxiliary = get_bc_auxilary_data_2d(f_0, index, _boundary_id, _missing_mask, bc_struct)
 
             # Apply post-streaming type boundary conditions
-            f_post_stream = apply_post_streaming_bc(f_post_collision, f_post_stream, f_auxiliary, _missing_mask, _boundary_id, bc_struct)
+            _f_post_stream = apply_post_streaming_bc(_f_post_collision, _f_post_stream, _f_auxiliary, _missing_mask, _boundary_id, bc_struct)
 
             # Compute rho and u
-            rho, u = self.macroscopic.warp_functional(f_post_stream)
+            _rho, _u = self.macroscopic.warp_functional(_f_post_stream)
 
             # Compute equilibrium
-            feq = self.equilibrium.warp_functional(rho, u)
+            _feq = self.equilibrium.warp_functional(_rho, _u)
 
             # Apply collision
-            f_post_collision = self.collision.warp_functional(
-                f_post_stream,
-                feq,
-                rho,
-                u,
-            )
+            _f_post_collision = self.collision.warp_functional(_f_post_stream, _feq, _rho, _u)
 
             # Apply post-collision type boundary conditions
-            f_post_collision = apply_post_collision_bc(f_post_stream, f_post_collision, f_auxiliary, _missing_mask, _boundary_id, bc_struct)
+            _f_post_collision = apply_post_collision_bc(_f_post_stream, _f_post_collision, _f_auxiliary, _missing_mask, _boundary_id, bc_struct)
 
             # Set the output
             for l in range(self.velocity_set.q):
-                f_1[l, index[0], index[1]] = f_post_collision[l]
+                f_1[l, index[0], index[1]] = self.store_dtype(_f_post_collision[l])
 
         # Construct the kernel
         @wp.kernel
@@ -331,33 +326,33 @@ def kernel3d(
             index = wp.vec3i(i, j, k)  # TODO warp should fix this
 
             # Read thread data for populations and missing mask
-            f_post_collision, _missing_mask = get_thread_data_3d(f_0, missing_mask, index)
+            _f_post_collision, _missing_mask = get_thread_data_3d(f_0, missing_mask, index)
 
             # Apply streaming (pull method)
-            f_post_stream = self.stream.warp_functional(f_0, index)
+            _f_post_stream = self.stream.warp_functional(f_0, index)
 
             # Prepare auxilary data for BC (if applicable)
             _boundary_id = bc_mask[0, index[0], index[1], index[2]]
-            f_auxiliary = get_bc_auxilary_data_3d(f_0, index, _boundary_id, _missing_mask, bc_struct)
+            _f_auxiliary = get_bc_auxilary_data_3d(f_0, index, _boundary_id, _missing_mask, bc_struct)
 
             # Apply post-streaming type boundary conditions
-            f_post_stream = apply_post_streaming_bc(f_post_collision, f_post_stream, f_auxiliary, _missing_mask, _boundary_id, bc_struct)
+            _f_post_stream = apply_post_streaming_bc(_f_post_collision, _f_post_stream, _f_auxiliary, _missing_mask, _boundary_id, bc_struct)
 
             # Compute rho and u
-            rho, u = self.macroscopic.warp_functional(f_post_stream)
+            _rho, _u = self.macroscopic.warp_functional(_f_post_stream)
 
             # Compute equilibrium
-            feq = self.equilibrium.warp_functional(rho, u)
+            _feq = self.equilibrium.warp_functional(_rho, _u)
 
             # Apply collision
-            f_post_collision = self.collision.warp_functional(f_post_stream, feq, rho, u)
+            _f_post_collision = self.collision.warp_functional(_f_post_stream, _feq, _rho, _u)
 
             # Apply post-collision type boundary conditions
-            f_post_collision = apply_post_collision_bc(f_post_stream, f_post_collision, f_auxiliary, _missing_mask, _boundary_id, bc_struct)
+            _f_post_collision = apply_post_collision_bc(_f_post_stream, _f_post_collision, _f_auxiliary, _missing_mask, _boundary_id, bc_struct)
 
             # Set the output
             for l in range(self.velocity_set.q):
-                f_1[l, index[0], index[1], index[2]] = f_post_collision[l]
+                f_1[l, index[0], index[1], index[2]] = self.store_dtype(_f_post_collision[l])
 
         # Return the correct kernel
         kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
diff --git a/xlb/operator/stream/stream.py b/xlb/operator/stream/stream.py
index d96c307..dc2417a 100644
--- a/xlb/operator/stream/stream.py
+++ b/xlb/operator/stream/stream.py
@@ -76,7 +76,7 @@ def functional2d(
                         pull_index[d] = 0
 
                 # Read the distribution function
-                _f[l] = f[l, pull_index[0], pull_index[1]]
+                _f[l] = self.compute_dtype(f[l, pull_index[0], pull_index[1]])
 
             return _f
 
@@ -94,7 +94,7 @@ def kernel2d(
 
             # Write the output
             for l in range(self.velocity_set.q):
-                f_1[l, index[0], index[1]] = _f[l]
+                f_1[l, index[0], index[1]] = self.store_dtype(_f[l])
 
         # Construct the funcional to get streamed indices
         @wp.func
@@ -117,7 +117,8 @@ def functional3d(
                         pull_index[d] = 0
 
                 # Read the distribution function
-                _f[l] = f[l, pull_index[0], pull_index[1], pull_index[2]]
+                # Unlike other functionals, we need to cast the type here since we read from the buffer
+                _f[l] = self.compute_dtype(f[l, pull_index[0], pull_index[1], pull_index[2]])
 
             return _f
 
@@ -136,7 +137,7 @@ def kernel3d(
 
             # Write the output
             for l in range(self.velocity_set.q):
-                f_1[l, index[0], index[1], index[2]] = _f[l]
+                f_1[l, index[0], index[1], index[2]] = self.store_dtype(_f[l])
 
         functional = functional3d if self.velocity_set.d == 3 else functional2d
         kernel = kernel3d if self.velocity_set.d == 3 else kernel2d
diff --git a/xlb/precision_policy.py b/xlb/precision_policy.py
index 3b0f85f..d85deed 100644
--- a/xlb/precision_policy.py
+++ b/xlb/precision_policy.py
@@ -1,7 +1,6 @@
 # Enum for precision policy
 
 from enum import Enum, auto
-
 import jax.numpy as jnp
 import warp as wp
 
@@ -87,12 +86,4 @@ def cast_to_compute_jax(self, array):
 
     def cast_to_store_jax(self, array):
         store_precision = self.store_precision
-        return jnp.array(array, dtype=store_precision.jax_dtype)
-
-    def cast_to_compute_warp(self, array):
-        compute_precision = self.compute_precision
-        return wp.array(array, dtype=compute_precision.wp_dtype)
-
-    def cast_to_store_warp(self, array):
-        store_precision = self.store_precision
-        return wp.array(array, dtype=store_precision.wp_dtype)
+        return jnp.array(array, dtype=store_precision.jax_dtype)
\ No newline at end of file