diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 9341352d..046efd7d 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,7 +1,7 @@
 // devcontainer.json
     {
       "name": "athenapk-dev",
-      "image": "ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent",
+      "image": "ghcr.io/parthenon-hpc-lab/cuda11.6-noascent",
       // disable Dockerfile for now
       //"build": {
       //  // Path is relative to the devcontainer.json file.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1d8fbe03..8fcfcba2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -42,7 +42,7 @@ jobs:
           cd build
           # Pick GPU with most available memory
           export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
-          ctest -L ${{ matrix.parallel }}
+          ctest -L ${{ matrix.parallel }} --timeout 3600
       - uses: actions/upload-artifact@v3
         if: ${{ always() }}
         with:
@@ -57,6 +57,8 @@ jobs:
             build/tst/regression/outputs/cluster_tabular_cooling/convergence.png
             build/tst/regression/outputs/aniso_therm_cond_ring_conv/ring_convergence.png
             build/tst/regression/outputs/aniso_therm_cond_gauss_conv/cond.png
+            build/tst/regression/outputs/diffusion/ohm.png
+            build/tst/regression/outputs/diffusion/visc.png
             build/tst/regression/outputs/field_loop/field_loop.png
             build/tst/regression/outputs/riemann_hydro/shock_tube.png
             build/tst/regression/outputs/turbulence/parthenon.out1.hst
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e5957e43..fc2af035 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,15 +3,22 @@
 ## Current develop (i.e., `main` branch)
 
 ### Added (new features/APIs/variables/...)
+- [[PR 89]](https://github.com/parthenon-hpc-lab/athenapk/pull/89) Add viscosity and resistivity
 - [[PR 1]](https://github.com/parthenon-hpc-lab/athenapk/pull/1) Add isotropic thermal conduction and RKL2 supertimestepping
 
 ### Changed (changing behavior/API/variables/...)
+- [[PR 122]](https://github.com/parthenon-hpc-lab/athenapk/pull/122) Fixed sqrt(4pi) factor in CGS Gauss unit and add unit doc
+- [[PR 119]](https://github.com/parthenon-hpc-lab/athenapk/pull/119) Fixed Athena++ paper test case for KHI pgen. Added turbulence pgen doc.
 - [[PR 97]](https://github.com/parthenon-hpc-lab/athenapk/pull/97) Fixed Schure cooling curve. Removed SD one. Added description of cooling function conventions.
 - [[PR 84]](https://github.com/parthenon-hpc-lab/athenapk/pull/84) Bump Parthenon to latest develop (2024-02-15)
 
 ### Fixed (not changing behavior/API/variables/...)
+- [[PR 128]](https://github.com/parthenon-hpc-lab/athenapk/pull/128) Fixed `dt_diff` in RKL2
 
 ### Infrastructure
+- [[PR 128]](https://github.com/parthenon-hpc-lab/athenapk/pull/128) Bump Parthenon to support `dn` based outputs
+- [[PR 124]](https://github.com/parthenon-hpc-lab/athenapk/pull/124) Bump Kokkos 4.4.1 (and Parthenon to include view-of-view fix)
+- [[PR 117]](https://github.com/parthenon-hpc-lab/athenapk/pull/117) Update devcontainer.json to latest CI container
 - [[PR 114]](https://github.com/parthenon-hpc-lab/athenapk/pull/114) Bump Parthenon 24.08 and Kokkos to 4.4.00
 - [[PR 112]](https://github.com/parthenon-hpc-lab/athenapk/pull/112) Add dev container configuration
 - [[PR 105]](https://github.com/parthenon-hpc-lab/athenapk/pull/105) Bump Parthenon to latest develop (2024-03-13)
@@ -20,6 +27,8 @@
 ### Removed (removing behavior/API/varaibles/...)
 
 ### Incompatibilities (i.e. breaking changes)
+- [[PR 124]](https://github.com/parthenon-hpc-lab/athenapk/pull/124) Enrolling custom boundary conditions changed
+  - Boundary conditions can now be enrolled using a string that can be subsequently be used in the input file (see, e.g., cloud problem generator)
 - [[PR 114]](https://github.com/parthenon-hpc-lab/athenapk/pull/114) Bump Parthenon 24.08 and Kokkos to 4.4.00
   - Changed signature of `UserWorkBeforeOutput` to include `SimTime` as last paramter
   - Fixes bitwise idential restarts for AMR simulations (the derefinement counter is now included)
diff --git a/README.md b/README.md
index 13d7280a..882d9625 100644
--- a/README.md
+++ b/README.md
@@ -14,8 +14,13 @@ Current features include
   - HLLE (hydro and MHD), HLLC (hydro), and HLLD (MHD) Riemann solvers
   - adiabatic equation of state
   - MHD based on hyperbolic divergence cleaning following Dedner+ 2002
-  - isotropic and anisotropic thermal conduction
-  - operator-split, second-order RKL2 supertimestepping for diffusive terms
+  - diffusion processes
+    - isotropic and anisotropic thermal conduction
+    - viscosity
+    - resistivity
+  - diffusion integrator
+    - unsplit
+    - operator-split, second-order RKL2 supertimestepping
   - optically thin cooling based on tabulated cooling tables with either Townsend 2009 exact integration or operator-split subcycling
 - static and adaptive mesh refinement
 - problem generators for
@@ -126,21 +131,8 @@ the `file_type = hdf5` format, see
 [VisIt](https://wci.llnl.gov/simulation/computer-codes/visit/).
 In ParaView, select the "XDMF Reader" when prompted.
 
-2. With [yt](https://yt-project.org/) -- though currently through a custom frontend
-that is not yet part of the main yt branch and, thus, has to be installed manually, e.g.,
-as follows:
-```bash
-cd ~/src # or any other folder of choice
-git clone https://github.com/forrestglines/yt.git
-cd yt
-git checkout parthenon-frontend
-
-# If you're using conda or virtualenv
-pip install -e .
-# OR alternatively, if you using the plain Python environment
-pip install --user -e .
-```
-Afterwards, `*.phdf` files can be read as usual with `yt.load()`.
+2. With [yt](https://yt-project.org/)
+As of versions >=4.4 `*.phdf` files can be read as usual with `yt.load()`.
 
 3. Using [Ascent](https://github.com/Alpine-DAV/ascent) (for in situ visualization and analysis).
 This requires Ascent to be installed/available at compile time of AthenaPK.
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 00000000..145cc0f8
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,37 @@
+# AthenaPK documentation
+
+Note that we're aware the rendering of equations in markdown on GitHub for the documentation
+is currently not great.
+Eventually, the docs will transition to Read the Docs (or similar).
+
+## Overview
+
+The documentation currently includes
+
+- [Configuring solvers in the input file](input.md)
+  - [An extended overview of various conventions for optically thin cooling implementations](cooling_notes.md)
+  - [Notebooks to calculate cooling tables from literature](cooling)
+- [Brief notes on developing code for AthenaPK](development.md)
+- [How to add a custom/user problem generator](pgen.md)
+- [Units](units.md)
+- Detailed descriptions of more complex problem generators
+  - [Galaxy Cluster and Cluster-like Problem Setup](cluster.md)
+  - [Driven turbulence](turbulence.md)
+
+## Tutorial
+
+An AthenaPK tutorial was given as part of the **Towards exascale-ready astrophysics**
+workshop https://indico3-jsc.fz-juelich.de/event/169/ taking place 25-27 Sep 2024 online.
+
+The material is currently located at https://github.com/pgrete/athenapk_tutorial
+
+While the instructions for building the code are specific to the workshop environment
+the tutorial itself should translate directly to other environments/systems.
+
+## Parthenon documenation
+
+Many paramters/options are directly controlled through the Parthenon framework
+(both with regard to building and in the input file).
+
+While the [Parthenon documenation](https://parthenon-hpc-lab.github.io/parthenon) is
+more geared towards developers it also contains useful information for users.
\ No newline at end of file
diff --git a/docs/img/turb_acc.png b/docs/img/turb_acc.png
new file mode 100644
index 00000000..b828a66e
Binary files /dev/null and b/docs/img/turb_acc.png differ
diff --git a/docs/img/turb_evol.png b/docs/img/turb_evol.png
new file mode 100644
index 00000000..de7aa7b7
Binary files /dev/null and b/docs/img/turb_evol.png differ
diff --git a/docs/img/turb_spec.png b/docs/img/turb_spec.png
new file mode 100644
index 00000000..e6a8c565
Binary files /dev/null and b/docs/img/turb_spec.png differ
diff --git a/docs/input.md b/docs/input.md
index c51aa8e7..0395a448 100644
--- a/docs/input.md
+++ b/docs/input.md
@@ -67,8 +67,65 @@ To control the floors, following parameters can be set in the `<hydro>` block:
 *Note* the pressure floor will take precedence over the temperature floor in the
 conserved to primitive conversion if both are defined.
 
+#### Units
+
+See(here)[units.md].
+
 #### Diffusive processes
 
+Diffusive processes in AthenaPK can be configured in the `<diffusion>` block of the input file.
+```
+<diffusion>
+integrator = unsplit       # alternatively: rkl2 (for rkl2 integrator (operator split integrator)
+#rkl2_max_dt_ratio = 100.0 # limits the ratio between the parabolic and hyperbolic timesteps (only used for RKL2 operator split integrator)
+#cfl = 1.0                 # Additional safety factor applied in the caluclation of the diffusive timestep (used in both unsplit and RKL2 integration schemes). Defaults to hyperbolic cfl.
+
+conduction = anisotropic               # none (disabled), or isotropic, or anisotropic
+conduction_coeff = fixed               # alternative: spitzer
+thermal_diff_coeff_code = 0.01         # fixed coefficent in code units (code_length^2/code_time)
+#spitzer_cond_in_erg_by_s_K_cm = 4.6e7 # spitzer coefficient in cgs units (requires definition of a unit system)
+#conduction_sat_phi = 0.3              # fudge factor to account for uncertainties in saturated fluxes
+
+
+viscosity = none            # none (disabled) or isotropic
+viscosity_coeff = fixed
+mom_diff_coeff_code = 0.25  # fixed coefficent of the kinetmatic viscosity in code units (code_length^2/code_time)
+
+resistivity = none          # none (disabled) or ohmic
+resistivity_coeff = fixed
+ohm_diff_coeff_code = 0.25  # fixed coefficent of the magnetic (ohmic) diffusivity code units (code_length^2/code_time)
+```
+(An)isotropic thermal conduction (with fixed or Spitzer coefficient), and isotropic viscosity and
+resistivity with fixed coefficient are currently implemented.
+They can be integrated in an unsplit manner or operator split using a second-order accurate RKL2
+supertimestepping algorithm.
+More details are described in the following.
+
+#### Integrators
+
+Diffusive processes can be integrated in either an unsplit
+fashion (`diffusion/integrator=unsplit`) or operator split using a second-order accurate
+RKL2 super timestepping algorithm (`diffusion/integrator=rkl2`) following [^M+14].
+
+In the unsplit case, the diffusive processes are included at the end of every stage in
+the main integration loop and the global timestep is limited accordingly.
+A separate CFL can be set for the diffusive processes via `diffusion/cfl=...`, which
+defaults to the hyperbolic value if not set.
+
+In the RKL2 case, the global timestep is not limited by the diffusive processes by default.
+However, as reported by [^V+17] a large number of stages
+($`s \approx \sqrt(\Delta t_{hyp}/\Delta t_{par}) \geq 20`$) in the supertimestepping
+(in combination with anisotropic, limited diffusion) may lead to a loss in accuracy, which
+is why the difference between hyperbolic and parabolic timesteps can be limited by
+`diffusion/rkl2_max_dt_ratio=...` and a warning is shown if the ratio is above 400.
+
+[^M+14]:
+    C. D. Meyer, D. S. Balsara, and T. D. Aslam, “A stabilized Runge–Kutta–Legendre method for explicit super-time-stepping of parabolic and mixed equations,” Journal of Computational Physics, vol. 257, pp. 594–626, 2014, doi: https://doi.org/10.1016/j.jcp.2013.08.021.
+
+[^V+17]:
+    B. Vaidya, D. Prasad, A. Mignone, P. Sharma, and L. Rickler, “Scalable explicit implementation of anisotropic diffusion with Runge–Kutta–Legendre super-time stepping,” Monthly Notices of the Royal Astronomical Society, vol. 472, no. 3, pp. 3147–3160, 2017, doi: 10.1093/mnras/stx2176.
+
+
 ##### Isotropic (hydro and MHD) and anisotropic thermal conduction (only MHD)
 In the presence of magnetic fields thermal conduction is becoming anisotropic with the flux along
 the local magnetic field direction typically being much stronger than the flux perpendicular to the magnetic field.
@@ -140,6 +197,29 @@ Default value corresponds to the typical value used in literature and goes back
 [^BM82]:
     S. A. Balbus and C. F. McKee, “The evaporation of spherical clouds in a hot gas. III - Suprathermal evaporation,” , vol. 252, pp. 529–552, Jan. 1982, doi: https://doi.org/10.1086/159581
 
+#### Viscosity/Momentum diffusion
+
+Only isotropic viscosity with a (spatially and temporally) fixed coefficient in code units
+(`code_length`^2/`code_time`) is currently implemented.
+To enable set (in the `<diffusion>` block)
+```
+viscosity = isotropic
+viscosity_coeff = fixed
+mom_diff_coeff_code = 0.25  # fixed coefficent of the kinetmatic viscosity in code units (code_length^2/code_time)
+```
+
+#### Resistivity/Ohmic diffusion
+
+Only resistivity with a (spatially and temporally) fixed coefficient in code units
+(`code_length`^2/`code_time`)is currently implemented.
+To enable set (in the `<diffusion>` block)
+```
+resistivity = ohmic
+resistivity_coeff = fixed
+ohm_diff_coeff_code = 0.25  # fixed coefficent of the magnetic (ohmic) diffusivity code units (code_length^2/code_time)
+```
+
+
 ### Additional MHD options in `<hydro>` block
 
 Parameter: `glmmhd_source` (string)
diff --git a/docs/pgen.md b/docs/pgen.md
index 0b58b9a2..9a2f58e8 100644
--- a/docs/pgen.md
+++ b/docs/pgen.md
@@ -1,4 +1,4 @@
-# Problem generators
+# Custom problem generators
 
 Different simulation setups in AthenaPK are controlled via the so-called problem generators.
 New problem generators can easily be added and we are happy to accept and merge contibuted problem
diff --git a/docs/turbulence.md b/docs/turbulence.md
new file mode 100644
index 00000000..b51b4b70
--- /dev/null
+++ b/docs/turbulence.md
@@ -0,0 +1,107 @@
+# Driven turbulence simulations
+
+The turbulence problem generator uses explicit inverse Fourier transformations (iFTs)
+on each meshblock in order to reduce communication during the iFT.
+Thus, it is only efficient if comparatively few modes are used (say < 100).
+
+Quite generally, driven turbulence simulations start from uniform initial conditions
+(uniform density and pressure, some initial magnetic field configuration in case of an
+MHD setup, and the fluid at rest) and reach a state of stationary, isotropic (or anisotropic
+depending on the strength of the background magnetic field) turbulence after one to few
+large eddy turnover times (again depending on the background magnetic field strength).
+The large eddy turnover time is usually defined as `T = L/U` with `L` being the scale
+of the largest eddies and `U` the root mean square Mach number in the stationary regime.
+
+The current implementation uses the following forcing spectrum
+`(k/k_peak)^2 * (2 - (k/k_peak)^2)`.
+Here, `k_peak` is the peak wavenumber of the forcing spectrum. It is related the scales of the largest eddies as
+`L = 1/k_f` given that a box size of 1 is currently assumed/hardcoded.
+
+## Problem setup
+
+An example parameter file can be found in `inputs/turbulence.in`.
+
+A typical setup contains the following blocks in the input file:
+
+```
+<job>
+problem_id = turbulence
+
+<problem/turbulence>
+rho0         = 1.0      # initial mean density
+p0           = 1.0      # initial mean pressure
+b0           = 0.01     # initial magnetic field strength
+b_config     = 0        # 0 - net flux; 1 - no net flux uniform B; 2 - non net flux sin B; 4 - field loop
+kpeak        = 2.0      # characteristic wavenumber
+corr_time    = 1.0      # autocorrelation time of the OU forcing process
+rseed        = 20190729 # random seed of the OU forcing process
+sol_weight   = 1.0      # solenoidal weight of the acceleration field
+accel_rms    = 0.5      # root mean square value of the acceleration field
+num_modes    = 30       # number of wavemodes
+
+<modes>
+k_1_0	= +2
+k_1_1	= -1
+k_1_2	= +0
+k_2_0	= +1
+...
+```
+
+The following parameters can be changed to control both the initial state:
+
+- `rho0` initial mean density
+- `p0` initial mean thermal pressure
+- `b0` initial mean magnetic field strength
+- `b_config`
+  - `0`: net flux case (uniform B_x)
+  - `1`: no net flux case (uniform B_x with changing sign in each half of the box)
+  - `2`: no net flux with initial sinosoidal B_x field
+  - `3`: deprecated
+  - `4`: closed field loop/cylinder in the box (in x-y plane) located at
+    - `x0=0.5` (default)
+    - `y0=0.5` (default)
+    - `z0=0.5` (default)
+    - and radius `loop_rad=0.25`
+
+as well as the driving field:
+
+- `kpeak` peak wavenumber of the forcing spectrum. Make sure to update the wavemodes to match `kpeak`, see below.
+- `corr_time` autocorrelation time of the acceleration field (in code units).
+Using delta-in-time forcing, i.e., a very low value, is discouraged, see [Grete et al. 2018 ApJL](https://iopscience.iop.org/article/10.3847/2041-8213/aac0f5).
+- `rseed` random seed for the OU process. Only change for new simulation, but keep unchanged for restarting simulations.
+- `sol_weight` solenoidal weight of the acceleration field. `1.0` is purely solenoidal/rotational and `0.0` is purely dilatational/compressive. Any value between `0.0` and `1.0` is possible. The parameter is related to the resulting rotational power in the 3D acceleration field as
+`1. - ((1-sol_weight)^2/(1-2*sol_weight+3*sol_weight^2))`, see eq (9) in [Federrath et al. 2010 A&A](
+https://doi.org/10.1051/0004-6361/200912437).
+- `accel_rms` root mean square value of the acceleration (controls the "strength" of the forcing field)
+- `num_modes` number of wavemodes that are specified in the `<modes>` section of the parameter file.
+The modes are specified manually as an explicit inverse FT is performed and only modes set are included (all others are assumed to be 0).
+This is done to make the global inverse FT possible without any
+expensive communication between blocks but this becomes excessively
+expensiv for large number of modes.
+Typically using a few tens of modes is a good choice in practice.
+In order to generate a set of modes run the `inputs/generate_fmturb_modes.py` script and replace
+the corresponding parts of the parameter file with the output of the script.
+Within the script, the top three variables (`k_peak`, `k_high`, and `k_low`) need to be adjusted in
+order to generate a complete set (i.e., all) of wavemodes.
+Important, the `k_peak` in the script should match the `k_peak` set
+in the input file.
+Alternatively, wavemodes can be chosen/defined manually, e.g., if not all wavemodes are desired or
+only individual modes should be forced.
+
+## Typical results
+
+The results shown here are obtained from running simulations with the parameters given in the next section.
+
+### High level temporal evolution
+![image](img/turb_evol.png)
+
+### Power spectra
+![image](img/turb_spec.png)
+
+### Consistency of acceleration field
+As each meshblock does a full iFT of all modes the following slices from a run with 8 meshblocks 
+illustrate that there is no discontinuities at the meshblock boundary.
+
+Plot shows x-, y-, and z-acceleration (in rows top to bottom) slices in the x-, y-, and z-direction (in columns from left to right).
+
+![image](img/turb_acc.png)
\ No newline at end of file
diff --git a/docs/units.md b/docs/units.md
new file mode 100644
index 00000000..b26ccf6e
--- /dev/null
+++ b/docs/units.md
@@ -0,0 +1,58 @@
+# AthenaPK units
+
+## General unit system
+
+Internally, all calculations are done in "code units" and there are no conversions between
+code and physical units during runtime (with excetion like the temperature when cooling is
+being used).
+Therefore, in general no units need to be prescribed to run a simulation.
+
+If units are required (e.g., if cooling is used and, thus, a conversion between internal energy
+in code units and physical temperature is required) they are configured in the input block
+as follows:
+
+```
+<units>
+code_length_cgs = 3.085677580962325e+24 # 1 Mpc in cm
+code_mass_cgs = 1.98841586e+47          # 1e14 Msun in g
+code_time_cgs = 3.15576e+16             # 1 Gyr in s
+```
+
+This information will also be used by postprocessing tools (like yt) to convert between
+code units and a physical unit system (like cgs).
+
+Moreover, internally a set of factors from code to cgs units are available to process conversions
+if required (e.g., from the input file).
+
+For example, for an input parameter (in the input file) like
+
+```
+<problem/cloud>
+r0_cgs = 3.085677580962325e+20          # 100 pc
+```
+
+the conversion should happen in the problem generator lik
+
+```c++
+  r_cloud = pin->GetReal("problem/cloud", "r0_cgs") / units.code_length_cgs();
+```
+
+so that the resulting quantity is internally in code units (here code length).
+
+It is highly recommended to be *very* explicit/specific about units everywhere (as it is
+a common source of confusion) like adding the `_cgs` suffix to the parameter in the
+input file above.
+
+## Magnetic units
+
+Internally, AthenaPK (and almost all MHD codes) use
+[Heaviside-Lorentz units](https://en.wikipedia.org/wiki/Heaviside%E2%80%93Lorentz_units),
+where the magnetic field is transformed from $B \rightarrow B / \sqrt{4 \pi}$.
+(See also the note in the
+[Castro documentation](https://amrex-astro.github.io/Castro/docs/mhd.html) about this.)
+
+So when converting from CGS-Gaussian units to code units, it is necessary to divide
+by $\sqrt{4 \pi}$ (in addition to the base dimensional factors).
+This is automatically handled by the `units.code_magnetic_cgs()` factors.
+
+
diff --git a/external/Kokkos b/external/Kokkos
index 08ceff92..15dc143e 160000
--- a/external/Kokkos
+++ b/external/Kokkos
@@ -1 +1 @@
-Subproject commit 08ceff92bcf3a828844480bc1e6137eb74028517
+Subproject commit 15dc143e5f39949eece972a798e175c4b463d4b8
diff --git a/external/parthenon b/external/parthenon
index ec61c9cb..07948ef1 160000
--- a/external/parthenon
+++ b/external/parthenon
@@ -1 +1 @@
-Subproject commit ec61c9cb102d6a67a4dd03e7d7fa4e10c82c1032
+Subproject commit 07948ef150b7146acdd270b70c43701f3f26dd4c
diff --git a/inputs/cloud.in b/inputs/cloud.in
index 002febd0..e7e05f69 100644
--- a/inputs/cloud.in
+++ b/inputs/cloud.in
@@ -22,7 +22,7 @@ ox1_bc = outflow
 nx2 = 320
 x2min = -400
 x2max = 2100
-ix2_bc = user
+ix2_bc = cloud_inflow_x2
 ox2_bc = outflow
 
 nx3 = 192
diff --git a/inputs/diffusion.in b/inputs/diffusion.in
index 44d02bcb..915dd088 100644
--- a/inputs/diffusion.in
+++ b/inputs/diffusion.in
@@ -1,9 +1,9 @@
 # AthenaPK - a performance portable block structured AMR MHD code
-# Copyright (c) 2021, Athena Parthenon Collaboration. All rights reserved.
+# Copyright (c) 2021-2024, Athena Parthenon Collaboration. All rights reserved.
 # Licensed under the BSD 3-Clause License (the "LICENSE");
 
 <comment>
-problem   = Thermal diffusion setup
+problem   = Diffusion setup (for thermal, momentum and Ohmic diffusion tests)
 
 <job>
 problem_id = diffusion
@@ -60,11 +60,17 @@ reconstruction = dc
 gamma = 2.0
 
 <diffusion>
-integrator = unsplit
+integrator = rkl2
 conduction = anisotropic
 conduction_coeff = fixed
 thermal_diff_coeff_code = 0.01
-rkl2_max_dt_ratio = 400.0
+viscosity = none            # none (disabled), isotropic, or anisotropic
+viscosity_coeff = fixed
+mom_diff_coeff_code = 0.25
+resistivity = none          # none (disabled) or ohmic
+resistivity_coeff = fixed
+ohm_diff_coeff_code = 0.25
+rkl2_max_dt_ratio = 100.0
 
 <parthenon/output0>
 file_type = hdf5
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index ddcf09b7..ae2a521a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -7,9 +7,11 @@ add_executable(
         eos/adiabatic_glmmhd.cpp
         units.hpp
         eos/adiabatic_hydro.cpp
+        hydro/diffusion/conduction.cpp
         hydro/diffusion/diffusion.cpp
         hydro/diffusion/diffusion.hpp
-        hydro/diffusion/conduction.cpp
+        hydro/diffusion/resistivity.cpp
+        hydro/diffusion/viscosity.cpp
         hydro/hydro_driver.cpp
         hydro/hydro.cpp
         hydro/glmmhd/dedner_source.cpp
diff --git a/src/hydro/diffusion/conduction.cpp b/src/hydro/diffusion/conduction.cpp
index d6e57d30..40256024 100644
--- a/src/hydro/diffusion/conduction.cpp
+++ b/src/hydro/diffusion/conduction.cpp
@@ -179,8 +179,8 @@ Real EstimateConductionTimestep(MeshData<Real> *md) {
         },
         Kokkos::Min<Real>(min_dt_cond));
   }
-
-  return fac * min_dt_cond;
+  const auto &cfl_diff = hydro_pkg->Param<Real>("cfl_diff");
+  return cfl_diff * fac * min_dt_cond;
 }
 
 //---------------------------------------------------------------------------------------
diff --git a/src/hydro/diffusion/diffusion.cpp b/src/hydro/diffusion/diffusion.cpp
index 20617ca2..d8b1c59d 100644
--- a/src/hydro/diffusion/diffusion.cpp
+++ b/src/hydro/diffusion/diffusion.cpp
@@ -1,6 +1,6 @@
 //========================================================================================
 // AthenaPK - a performance portable block structured AMR astrophysical MHD code.
-// Copyright (c) 2021, Athena-Parthenon Collaboration. All rights reserved.
+// Copyright (c) 2021-2023, Athena-Parthenon Collaboration. All rights reserved.
 // Licensed under the 3-clause BSD License, see LICENSE file for details
 //========================================================================================
 //! \file diffusion.cpp
@@ -27,5 +27,27 @@ TaskStatus CalcDiffFluxes(StateDescriptor *hydro_pkg, MeshData<Real> *md) {
       ThermalFluxGeneral(md);
     }
   }
+  const auto &viscosity = hydro_pkg->Param<Viscosity>("viscosity");
+  if (viscosity != Viscosity::none) {
+    const auto &mom_diff = hydro_pkg->Param<MomentumDiffusivity>("mom_diff");
+
+    if (viscosity == Viscosity::isotropic &&
+        mom_diff.GetCoeffType() == ViscosityCoeff::fixed) {
+      MomentumDiffFluxIsoFixed(md);
+    } else {
+      MomentumDiffFluxGeneral(md);
+    }
+  }
+  const auto &resistivity = hydro_pkg->Param<Resistivity>("resistivity");
+  if (resistivity != Resistivity::none) {
+    const auto &ohm_diff = hydro_pkg->Param<OhmicDiffusivity>("ohm_diff");
+
+    if (resistivity == Resistivity::ohmic &&
+        ohm_diff.GetCoeffType() == ResistivityCoeff::fixed) {
+      OhmicDiffFluxIsoFixed(md);
+    } else {
+      OhmicDiffFluxGeneral(md);
+    }
+  }
   return TaskStatus::complete;
 }
diff --git a/src/hydro/diffusion/diffusion.hpp b/src/hydro/diffusion/diffusion.hpp
index 7d522902..9e03f25a 100644
--- a/src/hydro/diffusion/diffusion.hpp
+++ b/src/hydro/diffusion/diffusion.hpp
@@ -99,6 +99,71 @@ void ThermalFluxIsoFixed(MeshData<Real> *md);
 //! Calculate thermal conduction (general case incl. anisotropic and saturated)
 void ThermalFluxGeneral(MeshData<Real> *md);
 
+struct MomentumDiffusivity {
+ private:
+  Real mbar_, me_, kb_;
+  Viscosity viscosity_;
+  ViscosityCoeff viscosity_coeff_type_;
+  // "free" coefficient/prefactor. Value depends on viscosity set in the constructor.
+  Real coeff_;
+
+ public:
+  KOKKOS_INLINE_FUNCTION
+  MomentumDiffusivity(Viscosity viscosity, ViscosityCoeff viscosity_coeff_type,
+                      Real coeff, Real mbar, Real me, Real kb)
+      : viscosity_(viscosity), viscosity_coeff_type_(viscosity_coeff_type), coeff_(coeff),
+        mbar_(mbar), me_(me), kb_(kb) {}
+
+  KOKKOS_INLINE_FUNCTION
+  Real Get(const Real pres, const Real rho) const;
+
+  KOKKOS_INLINE_FUNCTION
+  Viscosity GetType() const { return viscosity_; }
+
+  KOKKOS_INLINE_FUNCTION
+  ViscosityCoeff GetCoeffType() const { return viscosity_coeff_type_; }
+};
+
+Real EstimateViscosityTimestep(MeshData<Real> *md);
+
+//! Calculate isotropic viscosity with fixed coefficient
+void MomentumDiffFluxIsoFixed(MeshData<Real> *md);
+//! Calculate viscosity (general case incl. anisotropic)
+void MomentumDiffFluxGeneral(MeshData<Real> *md);
+
+struct OhmicDiffusivity {
+ private:
+  Real mbar_, me_, kb_;
+  Resistivity resistivity_;
+  ResistivityCoeff resistivity_coeff_type_;
+  // "free" coefficient/prefactor. Value depends on resistivity set in the constructor.
+  Real coeff_;
+
+ public:
+  KOKKOS_INLINE_FUNCTION
+  OhmicDiffusivity(Resistivity resistivity, ResistivityCoeff resistivity_coeff_type,
+                   Real coeff, Real mbar, Real me, Real kb)
+      : resistivity_(resistivity), resistivity_coeff_type_(resistivity_coeff_type),
+        coeff_(coeff), mbar_(mbar), me_(me), kb_(kb) {}
+
+  KOKKOS_INLINE_FUNCTION
+  Real Get(const Real pres, const Real rho) const;
+
+  KOKKOS_INLINE_FUNCTION
+  Resistivity GetType() const { return resistivity_; }
+
+  KOKKOS_INLINE_FUNCTION
+  ResistivityCoeff GetCoeffType() const { return resistivity_coeff_type_; }
+};
+
+Real EstimateResistivityTimestep(MeshData<Real> *md);
+
+//! Calculate isotropic resistivity with fixed coefficient
+void OhmicDiffFluxIsoFixed(MeshData<Real> *md);
+
+//! Calculate resistivity (general case incl. Spitzer)
+void OhmicDiffFluxGeneral(MeshData<Real> *md);
+
 // Calculate all diffusion fluxes, i.e., update the .flux views in md
 TaskStatus CalcDiffFluxes(StateDescriptor *hydro_pkg, MeshData<Real> *md);
 
diff --git a/src/hydro/diffusion/resistivity.cpp b/src/hydro/diffusion/resistivity.cpp
new file mode 100644
index 00000000..da8bf9ad
--- /dev/null
+++ b/src/hydro/diffusion/resistivity.cpp
@@ -0,0 +1,240 @@
+//========================================================================================
+// AthenaPK - a performance portable block structured AMR astrophysical MHD code.
+// Copyright (c) 2024, Athena-Parthenon Collaboration. All rights reserved.
+// Licensed under the 3-clause BSD License, see LICENSE file for details
+//========================================================================================
+//! \file resistivity.cpp
+//! \brief
+
+// Parthenon headers
+#include <cmath>
+#include <parthenon/package.hpp>
+
+// AthenaPK headers
+#include "../../main.hpp"
+#include "config.hpp"
+#include "diffusion.hpp"
+#include "kokkos_abstraction.hpp"
+#include "utils/error_checking.hpp"
+
+using namespace parthenon::package::prelude;
+
+KOKKOS_INLINE_FUNCTION
+Real OhmicDiffusivity::Get(const Real pres, const Real rho) const {
+  if (resistivity_coeff_type_ == ResistivityCoeff::fixed) {
+    return coeff_;
+  } else if (resistivity_coeff_type_ == ResistivityCoeff::spitzer) {
+    PARTHENON_FAIL("needs impl");
+  } else {
+    PARTHENON_FAIL("Unknown Resistivity coeff");
+  }
+}
+
+Real EstimateResistivityTimestep(MeshData<Real> *md) {
+  // get to package via first block in Meshdata (which exists by construction)
+  auto hydro_pkg = md->GetBlockData(0)->GetBlockPointer()->packages.Get("Hydro");
+  const auto &prim_pack = md->PackVariables(std::vector<std::string>{"prim"});
+
+  IndexRange ib = md->GetBlockData(0)->GetBoundsI(IndexDomain::interior);
+  IndexRange jb = md->GetBlockData(0)->GetBoundsJ(IndexDomain::interior);
+  IndexRange kb = md->GetBlockData(0)->GetBoundsK(IndexDomain::interior);
+
+  Real min_dt_resist = std::numeric_limits<Real>::max();
+  const auto ndim = prim_pack.GetNdim();
+
+  Real fac = 0.5;
+  if (ndim == 2) {
+    fac = 0.25;
+  } else if (ndim == 3) {
+    fac = 1.0 / 6.0;
+  }
+
+  const auto &ohm_diff = hydro_pkg->Param<OhmicDiffusivity>("ohm_diff");
+
+  if (ohm_diff.GetType() == Resistivity::ohmic &&
+      ohm_diff.GetCoeffType() == ResistivityCoeff::fixed) {
+    // TODO(pgrete): once mindx is properly calculated before this loop, we can get rid of
+    // it entirely.
+    // Using 0.0 as parameters rho and p as they're not used anyway for a fixed coeff.
+    const auto ohm_diff_coeff = ohm_diff.Get(0.0, 0.0);
+    Kokkos::parallel_reduce(
+        "EstimateResistivityTimestep (ohmic fixed)",
+        Kokkos::MDRangePolicy<Kokkos::Rank<4>>(
+            DevExecSpace(), {0, kb.s, jb.s, ib.s},
+            {prim_pack.GetDim(5), kb.e + 1, jb.e + 1, ib.e + 1},
+            {1, 1, 1, ib.e + 1 - ib.s}),
+        KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &min_dt) {
+          const auto &coords = prim_pack.GetCoords(b);
+          min_dt =
+              fmin(min_dt, SQR(coords.Dxc<1>(k, j, i)) / (ohm_diff_coeff + TINY_NUMBER));
+          if (ndim >= 2) {
+            min_dt = fmin(min_dt,
+                          SQR(coords.Dxc<2>(k, j, i)) / (ohm_diff_coeff + TINY_NUMBER));
+          }
+          if (ndim >= 3) {
+            min_dt = fmin(min_dt,
+                          SQR(coords.Dxc<3>(k, j, i)) / (ohm_diff_coeff + TINY_NUMBER));
+          }
+        },
+        Kokkos::Min<Real>(min_dt_resist));
+  } else {
+    PARTHENON_THROW("Needs impl.");
+  }
+
+  const auto &cfl_diff = hydro_pkg->Param<Real>("cfl_diff");
+  return cfl_diff * fac * min_dt_resist;
+}
+
+//---------------------------------------------------------------------------------------
+//! Calculate isotropic resistivity with fixed coefficient
+
+void OhmicDiffFluxIsoFixed(MeshData<Real> *md) {
+  auto pmb = md->GetBlockData(0)->GetBlockPointer();
+  IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior);
+  IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior);
+  IndexRange kb = pmb->cellbounds.GetBoundsK(IndexDomain::interior);
+
+  std::vector<parthenon::MetadataFlag> flags_ind({Metadata::Independent});
+  auto cons_pack = md->PackVariablesAndFluxes(flags_ind);
+  auto hydro_pkg = pmb->packages.Get("Hydro");
+
+  auto const &prim_pack = md->PackVariables(std::vector<std::string>{"prim"});
+
+  const int ndim = pmb->pmy_mesh->ndim;
+
+  const auto &ohm_diff = hydro_pkg->Param<OhmicDiffusivity>("ohm_diff");
+  // Using fixed and uniform coefficient so it's safe to get it outside the kernel.
+  // Using 0.0 as parameters rho and p as they're not used anyway for a fixed coeff.
+  const auto eta = ohm_diff.Get(0.0, 0.0);
+
+  parthenon::par_for(
+      DEFAULT_LOOP_PATTERN, "Resist. X1 fluxes (ohmic)", DevExecSpace(), 0,
+      cons_pack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e + 1,
+      KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) {
+        const auto &coords = prim_pack.GetCoords(b);
+        auto &cons = cons_pack(b);
+        const auto &prim = prim_pack(b);
+
+        // Face centered current densities
+        // j2 = d3B1 - d1B3
+        const auto d3B1 =
+            ndim > 2 ? (0.5 * (prim(IB1, k + 1, j, i - 1) + prim(IB1, k + 1, j, i)) -
+                        0.5 * (prim(IB1, k - 1, j, i - 1) + prim(IB1, k - 1, j, i))) /
+                           (2.0 * coords.Dxf<3, 1>(k, j, i))
+                     : 0.0;
+
+        const auto d1B3 =
+            (prim(IB3, k, j, i) - prim(IB3, k, j, i - 1)) / coords.Dxc<1>(k, j, i);
+
+        const auto j2 = d3B1 - d1B3;
+
+        // j3 = d1B2 - d2B1
+        const auto d1B2 =
+            (prim(IB2, k, j, i) - prim(IB2, k, j, i - 1)) / coords.Dxc<1>(k, j, i);
+
+        const auto d2B1 =
+            ndim > 1 ? (0.5 * (prim(IB1, k, j + 1, i - 1) + prim(IB1, k, j + 1, i)) -
+                        0.5 * (prim(IB1, k, j - 1, i - 1) + prim(IB1, k, j - 1, i))) /
+                           (2.0 * coords.Dxf<2, 1>(k, j, i))
+                     : 0.0;
+
+        const auto j3 = d1B2 - d2B1;
+
+        cons.flux(X1DIR, IB2, k, j, i) += -eta * j3;
+        cons.flux(X1DIR, IB3, k, j, i) += eta * j2;
+        cons.flux(X1DIR, IEN, k, j, i) +=
+            0.5 * eta *
+            ((prim(IB3, k, j, i - 1) + prim(IB3, k, j, i)) * j2 -
+             (prim(IB2, k, j, i - 1) + prim(IB2, k, j, i)) * j3);
+      });
+
+  if (ndim < 2) {
+    return;
+  }
+
+  parthenon::par_for(
+      DEFAULT_LOOP_PATTERN, "Resist. X2 fluxes (ohmic)", parthenon::DevExecSpace(), 0,
+      cons_pack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e + 1, ib.s, ib.e,
+      KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) {
+        const auto &coords = prim_pack.GetCoords(b);
+        auto &cons = cons_pack(b);
+        const auto &prim = prim_pack(b);
+
+        // Face centered current densities
+        // j3 = d1B2 - d2B1
+        const auto d1B2 = (0.5 * (prim(IB2, k, j - 1, i + 1) + prim(IB2, k, j, i + 1)) -
+                           0.5 * (prim(IB2, k, j - 1, i - 1) + prim(IB2, k, j, i - 1))) /
+                          (2.0 * coords.Dxf<1, 2>(k, j, i));
+
+        const auto d2B1 =
+            (prim(IB1, k, j, i) - prim(IB1, k, j - 1, i)) / coords.Dxc<2>(k, j, i);
+
+        const auto j3 = d1B2 - d2B1;
+
+        // j1 = d2B3 - d3B2
+        const auto d2B3 =
+            (prim(IB3, k, j, i) - prim(IB3, k, j - 1, i)) / coords.Dxc<2>(k, j, i);
+
+        const auto d3B2 =
+            ndim > 2 ? (0.5 * (prim(IB2, k + 1, j - 1, i) + prim(IB2, k + 1, j, i)) -
+                        0.5 * (prim(IB2, k - 1, j - 1, i) + prim(IB2, k - 1, j, i))) /
+                           (2.0 * coords.Dxf<3, 2>(k, j, i))
+                     : 0.0;
+
+        const auto j1 = d2B3 - d3B2;
+
+        cons.flux(X2DIR, IB1, k, j, i) += eta * j3;
+        cons.flux(X2DIR, IB3, k, j, i) += -eta * j1;
+        cons.flux(X2DIR, IEN, k, j, i) +=
+            0.5 * eta *
+            ((prim(IB1, k, j - 1, i) + prim(IB1, k, j, i)) * j3 -
+             (prim(IB3, k, j - 1, i) + prim(IB3, k, j, i)) * j1);
+      });
+
+  if (ndim < 3) {
+    return;
+  }
+
+  parthenon::par_for(
+      DEFAULT_LOOP_PATTERN, "Resist. X3 fluxes (ohmic)", parthenon::DevExecSpace(), 0,
+      cons_pack.GetDim(5) - 1, kb.s, kb.e + 1, jb.s, jb.e, ib.s, ib.e,
+      KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) {
+        const auto &coords = prim_pack.GetCoords(b);
+        auto &cons = cons_pack(b);
+        const auto &prim = prim_pack(b);
+
+        // Face centered current densities
+        // j1 = d2B3 - d3B2
+        const auto d2B3 = (0.5 * (prim(IB3, k - 1, j + 1, i) + prim(IB3, k, j + 1, i)) -
+                           0.5 * (prim(IB3, k - 1, j - 1, i) + prim(IB3, k, j - 1, i))) /
+                          (2.0 * coords.Dxf<2, 3>(k, j, i));
+
+        const auto d3B2 =
+            (prim(IB2, k, j, i) - prim(IB2, k - 1, j, i)) / coords.Dxc<3>(k, j, i);
+
+        const auto j1 = d2B3 - d3B2;
+
+        // j2 = d3B1 - d1B3
+        const auto d3B1 =
+            (prim(IB1, k, j, i) - prim(IB1, k - 1, j, i)) / coords.Dxc<3>(k, j, i);
+
+        const auto d1B3 = (0.5 * (prim(IB3, k - 1, j, i + 1) + prim(IB3, k, j, i + 1)) -
+                           0.5 * (prim(IB3, k - 1, j, i - 1) + prim(IB3, k, j, i - 1))) /
+                          (2.0 * coords.Dxf<1, 3>(k, j, i));
+
+        const auto j2 = d3B1 - d1B3;
+
+        cons.flux(X3DIR, IB1, k, j, i) += -eta * j2;
+        cons.flux(X3DIR, IB2, k, j, i) += eta * j1;
+        cons.flux(X3DIR, IEN, k, j, i) +=
+            0.5 * eta *
+            ((prim(IB2, k - 1, j, i) + prim(IB2, k, j, i)) * j1 -
+             (prim(IB1, k - 1, j, i) + prim(IB1, k, j, i)) * j2);
+      });
+}
+
+//---------------------------------------------------------------------------------------
+//! TODO(pgrete) Calculate Ohmic diffusion, general case, e.g., with varying (Spitzer)
+//! coefficient
+
+void OhmicDiffFluxGeneral(MeshData<Real> *md) { PARTHENON_THROW("Needs impl."); }
\ No newline at end of file
diff --git a/src/hydro/diffusion/viscosity.cpp b/src/hydro/diffusion/viscosity.cpp
new file mode 100644
index 00000000..f222c869
--- /dev/null
+++ b/src/hydro/diffusion/viscosity.cpp
@@ -0,0 +1,295 @@
+//========================================================================================
+// AthenaPK - a performance portable block structured AMR astrophysical MHD code.
+// Copyright (c) 2021-2024, Athena-Parthenon Collaboration. All rights reserved.
+// Licensed under the 3-clause BSD License, see LICENSE file for details
+//========================================================================================
+// AthenaXXX astrophysical plasma code
+// Copyright(C) 2020 James M. Stone <jmstone@ias.edu> and the Athena code team
+// Licensed under the 3-clause BSD License (the "LICENSE")
+//========================================================================================
+//! \file viscosity.cpp
+//! \brief
+
+// Parthenon headers
+#include <cmath>
+#include <parthenon/package.hpp>
+
+// AthenaPK headers
+#include "../../main.hpp"
+#include "config.hpp"
+#include "diffusion.hpp"
+#include "kokkos_abstraction.hpp"
+#include "utils/error_checking.hpp"
+
+using namespace parthenon::package::prelude;
+
+KOKKOS_INLINE_FUNCTION
+Real MomentumDiffusivity::Get(const Real pres, const Real rho) const {
+  if (viscosity_coeff_type_ == ViscosityCoeff::fixed) {
+    return coeff_;
+  } else {
+    PARTHENON_FAIL("Unknown viscosity coeff");
+  }
+}
+
+Real EstimateViscosityTimestep(MeshData<Real> *md) {
+  // get to package via first block in Meshdata (which exists by construction)
+  auto hydro_pkg = md->GetBlockData(0)->GetBlockPointer()->packages.Get("Hydro");
+  const auto &prim_pack = md->PackVariables(std::vector<std::string>{"prim"});
+
+  IndexRange ib = md->GetBlockData(0)->GetBoundsI(IndexDomain::interior);
+  IndexRange jb = md->GetBlockData(0)->GetBoundsJ(IndexDomain::interior);
+  IndexRange kb = md->GetBlockData(0)->GetBoundsK(IndexDomain::interior);
+
+  Real min_dt_visc = std::numeric_limits<Real>::max();
+  const auto ndim = prim_pack.GetNdim();
+
+  Real fac = 0.5;
+  if (ndim == 2) {
+    fac = 0.25;
+  } else if (ndim == 3) {
+    fac = 1.0 / 6.0;
+  }
+
+  const auto gm1 = hydro_pkg->Param<Real>("AdiabaticIndex");
+  const auto &mom_diff = hydro_pkg->Param<MomentumDiffusivity>("mom_diff");
+
+  if (mom_diff.GetType() == Viscosity::isotropic &&
+      mom_diff.GetCoeffType() == ViscosityCoeff::fixed) {
+    // TODO(pgrete): once mindx is properly calculated before this loop, we can get rid of
+    // it entirely.
+    // Using 0.0 as parameters rho and p as they're not used anyway for a fixed coeff.
+    const auto mom_diff_coeff = mom_diff.Get(0.0, 0.0);
+    Kokkos::parallel_reduce(
+        "EstimateViscosityTimestep (iso fixed)",
+        Kokkos::MDRangePolicy<Kokkos::Rank<4>>(
+            DevExecSpace(), {0, kb.s, jb.s, ib.s},
+            {prim_pack.GetDim(5), kb.e + 1, jb.e + 1, ib.e + 1},
+            {1, 1, 1, ib.e + 1 - ib.s}),
+        KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &min_dt) {
+          const auto &coords = prim_pack.GetCoords(b);
+          min_dt =
+              fmin(min_dt, SQR(coords.Dxc<1>(k, j, i)) / (mom_diff_coeff + TINY_NUMBER));
+          if (ndim >= 2) {
+            min_dt = fmin(min_dt,
+                          SQR(coords.Dxc<2>(k, j, i)) / (mom_diff_coeff + TINY_NUMBER));
+          }
+          if (ndim >= 3) {
+            min_dt = fmin(min_dt,
+                          SQR(coords.Dxc<3>(k, j, i)) / (mom_diff_coeff + TINY_NUMBER));
+          }
+        },
+        Kokkos::Min<Real>(min_dt_visc));
+  } else {
+    PARTHENON_THROW("Needs impl.");
+  }
+
+  const auto &cfl_diff = hydro_pkg->Param<Real>("cfl_diff");
+  return cfl_diff * fac * min_dt_visc;
+}
+
+//---------------------------------------------------------------------------------------
+//! Calculate isotropic viscosity with fixed coefficient
+
+void MomentumDiffFluxIsoFixed(MeshData<Real> *md) {
+  auto pmb = md->GetBlockData(0)->GetBlockPointer();
+  IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior);
+  IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior);
+  IndexRange kb = pmb->cellbounds.GetBoundsK(IndexDomain::interior);
+
+  std::vector<parthenon::MetadataFlag> flags_ind({Metadata::Independent});
+  auto cons_pack = md->PackVariablesAndFluxes(flags_ind);
+  auto hydro_pkg = pmb->packages.Get("Hydro");
+
+  auto const &prim_pack = md->PackVariables(std::vector<std::string>{"prim"});
+
+  const int ndim = pmb->pmy_mesh->ndim;
+
+  const auto &mom_diff = hydro_pkg->Param<MomentumDiffusivity>("mom_diff");
+  // Using fixed and uniform coefficient so it's safe to get it outside the kernel.
+  // Using 0.0 as parameters rho and p as they're not used anyway for a fixed coeff.
+  const auto nu = mom_diff.Get(0.0, 0.0);
+
+  const int scratch_level =
+      hydro_pkg->Param<int>("scratch_level"); // 0 is actual scratch (tiny); 1 is HBM
+  const int nx1 = pmb->cellbounds.ncellsi(IndexDomain::entire);
+
+  size_t scratch_size_in_bytes = parthenon::ScratchPad1D<Real>::shmem_size(nx1) * 3;
+
+  parthenon::par_for_outer(
+      DEFAULT_OUTER_LOOP_PATTERN, "Visc. X1 fluxes (iso)", DevExecSpace(),
+      scratch_size_in_bytes, scratch_level, 0, cons_pack.GetDim(5) - 1, kb.s, kb.e, jb.s,
+      jb.e,
+      KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k, const int j) {
+        const auto &coords = prim_pack.GetCoords(b);
+        auto &cons = cons_pack(b);
+        const auto &prim = prim_pack(b);
+        parthenon::ScratchPad1D<Real> fvx(member.team_scratch(scratch_level), nx1);
+        parthenon::ScratchPad1D<Real> fvy(member.team_scratch(scratch_level), nx1);
+        parthenon::ScratchPad1D<Real> fvz(member.team_scratch(scratch_level), nx1);
+
+        // Add [2(dVx/dx)-(2/3)dVx/dx, dVy/dx, dVz/dx]
+        par_for_inner(member, ib.s, ib.e + 1, [&](const int i) {
+          fvx(i) = 4.0 * (prim(IV1, k, j, i) - prim(IV1, k, j, i - 1)) /
+                   (3.0 * coords.Dxc<1>(i));
+          fvy(i) = (prim(IV2, k, j, i) - prim(IV2, k, j, i - 1)) / coords.Dxc<1>(i);
+          fvz(i) = (prim(IV3, k, j, i) - prim(IV3, k, j, i - 1)) / coords.Dxc<1>(i);
+        });
+        member.team_barrier();
+
+        // In 2D/3D Add [(-2/3)dVy/dy, dVx/dy, 0]
+        if (ndim > 1) {
+          par_for_inner(member, ib.s, ib.e + 1, [&](const int i) {
+            fvx(i) -= ((prim(IV2, k, j + 1, i) + prim(IV2, k, j + 1, i - 1)) -
+                       (prim(IV2, k, j - 1, i) + prim(IV2, k, j - 1, i - 1))) /
+                      (6.0 * coords.Dxc<2>(j));
+            fvy(i) += ((prim(IV1, k, j + 1, i) + prim(IV1, k, j + 1, i - 1)) -
+                       (prim(IV1, k, j - 1, i) + prim(IV1, k, j - 1, i - 1))) /
+                      (4.0 * coords.Dxc<2>(j));
+          });
+          member.team_barrier();
+        }
+
+        // In 3D Add [(-2/3)dVz/dz, 0,  dVx/dz]
+        if (ndim > 2) {
+          par_for_inner(member, ib.s, ib.e + 1, [&](const int i) {
+            fvx(i) -= ((prim(IV3, k + 1, j, i) + prim(IV3, k + 1, j, i - 1)) -
+                       (prim(IV3, k - 1, j, i) + prim(IV3, k - 1, j, i - 1))) /
+                      (6.0 * coords.Dxc<3>(k));
+            fvz(i) += ((prim(IV1, k + 1, j, i) + prim(IV1, k + 1, j, i - 1)) -
+                       (prim(IV1, k - 1, j, i) + prim(IV1, k - 1, j, i - 1))) /
+                      (4.0 * coords.Dxc<3>(k));
+          });
+          member.team_barrier();
+        }
+
+        // Sum viscous fluxes into fluxes of conserved variables; including energy fluxes
+        par_for_inner(member, ib.s, ib.e + 1, [&](const int i) {
+          Real nud = 0.5 * nu * (prim(IDN, k, j, i) + prim(IDN, k, j, i - 1));
+          cons.flux(X1DIR, IV1, k, j, i) -= nud * fvx(i);
+          cons.flux(X1DIR, IV2, k, j, i) -= nud * fvy(i);
+          cons.flux(X1DIR, IV3, k, j, i) -= nud * fvz(i);
+          cons.flux(X1DIR, IEN, k, j, i) -=
+              0.5 * nud *
+              ((prim(IV1, k, j, i - 1) + prim(IV1, k, j, i)) * fvx(i) +
+               (prim(IV2, k, j, i - 1) + prim(IV2, k, j, i)) * fvy(i) +
+               (prim(IV3, k, j, i - 1) + prim(IV3, k, j, i)) * fvz(i));
+        });
+      });
+
+  if (ndim < 2) {
+    return;
+  }
+  /* Compute viscous fluxes in 2-direction  --------------------------------------*/
+  parthenon::par_for_outer(
+      DEFAULT_OUTER_LOOP_PATTERN, "Visc. X2 fluxes (iso)", parthenon::DevExecSpace(),
+      scratch_size_in_bytes, scratch_level, 0, cons_pack.GetDim(5) - 1, kb.s, kb.e, jb.s,
+      jb.e + 1,
+      KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k, const int j) {
+        const auto &coords = prim_pack.GetCoords(b);
+        auto &cons = cons_pack(b);
+        const auto &prim = prim_pack(b);
+        parthenon::ScratchPad1D<Real> fvx(member.team_scratch(scratch_level), nx1);
+        parthenon::ScratchPad1D<Real> fvy(member.team_scratch(scratch_level), nx1);
+        parthenon::ScratchPad1D<Real> fvz(member.team_scratch(scratch_level), nx1);
+
+        // Add [(dVx/dy+dVy/dx), 2(dVy/dy)-(2/3)(dVx/dx+dVy/dy), dVz/dy]
+        par_for_inner(member, ib.s, ib.e, [&](const int i) {
+          fvx(i) = (prim(IV1, k, j, i) - prim(IV1, k, j - 1, i)) / coords.Dxc<2>(j) +
+                   ((prim(IV2, k, j, i + 1) + prim(IV2, k, j - 1, i + 1)) -
+                    (prim(IV2, k, j, i - 1) + prim(IV2, k, j - 1, i - 1))) /
+                       (4.0 * coords.Dxc<1>(i));
+          fvy(i) = (prim(IV2, k, j, i) - prim(IV2, k, j - 1, i)) * 4.0 /
+                       (3.0 * coords.Dxc<2>(j)) -
+                   ((prim(IV1, k, j, i + 1) + prim(IV1, k, j - 1, i + 1)) -
+                    (prim(IV1, k, j, i - 1) + prim(IV1, k, j - 1, i - 1))) /
+                       (6.0 * coords.Dxc<1>(i));
+          fvz(i) = (prim(IV3, k, j, i) - prim(IV3, k, j - 1, i)) / coords.Dxc<2>(j);
+        });
+        member.team_barrier();
+
+        // In 3D Add [0, (-2/3)dVz/dz, dVy/dz]
+        if (ndim > 2) {
+          par_for_inner(member, ib.s, ib.e, [&](const int i) {
+            fvy(i) -= ((prim(IV3, k + 1, j, i) + prim(IV3, k + 1, j - 1, i)) -
+                       (prim(IV3, k - 1, j, i) + prim(IV3, k - 1, j - 1, i))) /
+                      (6.0 * coords.Dxc<3>(k));
+            fvz(i) += ((prim(IV2, k + 1, j, i) + prim(IV2, k + 1, j - 1, i)) -
+                       (prim(IV2, k - 1, j, i) + prim(IV2, k - 1, j - 1, i))) /
+                      (4.0 * coords.Dxc<3>(k));
+          });
+          member.team_barrier();
+        }
+
+        // Sum viscous fluxes into fluxes of conserved variables; including energy fluxes
+        par_for_inner(member, ib.s, ib.e, [&](const int i) {
+          Real nud = 0.5 * nu * (prim(IDN, k, j, i) + prim(IDN, k, j - 1, i));
+          cons.flux(X2DIR, IV1, k, j, i) -= nud * fvx(i);
+          cons.flux(X2DIR, IV2, k, j, i) -= nud * fvy(i);
+          cons.flux(X2DIR, IV3, k, j, i) -= nud * fvz(i);
+          cons.flux(X2DIR, IEN, k, j, i) -=
+              0.5 * nud *
+              ((prim(IV1, k, j - 1, i) + prim(IV1, k, j, i)) * fvx(i) +
+               (prim(IV2, k, j - 1, i) + prim(IV2, k, j, i)) * fvy(i) +
+               (prim(IV3, k, j - 1, i) + prim(IV3, k, j, i)) * fvz(i));
+        });
+      });
+  /* Compute heat fluxes in 3-direction, 3D problem ONLY  ---------------------*/
+  if (ndim < 3) {
+    return;
+  }
+
+  parthenon::par_for_outer(
+      DEFAULT_OUTER_LOOP_PATTERN, "Visc. X3 fluxes (iso)", parthenon::DevExecSpace(),
+      scratch_size_in_bytes, scratch_level, 0, cons_pack.GetDim(5) - 1, kb.s, kb.e + 1,
+      jb.s, jb.e,
+      KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k, const int j) {
+        const auto &coords = prim_pack.GetCoords(b);
+        auto &cons = cons_pack(b);
+        const auto &prim = prim_pack(b);
+
+        parthenon::ScratchPad1D<Real> fvx(member.team_scratch(scratch_level), nx1);
+        parthenon::ScratchPad1D<Real> fvy(member.team_scratch(scratch_level), nx1);
+        parthenon::ScratchPad1D<Real> fvz(member.team_scratch(scratch_level), nx1);
+
+        // Add [(dVx/dz+dVz/dx), (dVy/dz+dVz/dy), 2(dVz/dz)-(2/3)(dVx/dx+dVy/dy+dVz/dz)]
+        par_for_inner(member, ib.s, ib.e, [&](const int i) {
+          fvx(i) = (prim(IV1, k, j, i) - prim(IV1, k - 1, j, i)) / coords.Dxc<3>(k) +
+                   ((prim(IV3, k, j, i + 1) + prim(IV3, k - 1, j, i + 1)) -
+                    (prim(IV3, k, j, i - 1) + prim(IV3, k - 1, j, i - 1))) /
+                       (4.0 * coords.Dxc<1>(i));
+          fvy(i) = (prim(IV2, k, j, i) - prim(IV2, k - 1, j, i)) / coords.Dxc<3>(k) +
+                   ((prim(IV3, k, j + 1, i) + prim(IV3, k - 1, j + 1, i)) -
+                    (prim(IV3, k, j - 1, i) + prim(IV3, k - 1, j - 1, i))) /
+                       (4.0 * coords.Dxc<2>(j));
+          fvz(i) = (prim(IV3, k, j, i) - prim(IV3, k - 1, j, i)) * 4.0 /
+                       (3.0 * coords.Dxc<3>(k)) -
+                   ((prim(IV1, k, j, i + 1) + prim(IV1, k - 1, j, i + 1)) -
+                    (prim(IV1, k, j, i - 1) + prim(IV1, k - 1, j, i - 1))) /
+                       (6.0 * coords.Dxc<1>(i)) -
+                   ((prim(IV2, k, j + 1, i) + prim(IV2, k - 1, j + 1, i)) -
+                    (prim(IV2, k, j - 1, i) + prim(IV2, k - 1, j - 1, i))) /
+                       (6.0 * coords.Dxc<2>(j));
+        });
+        member.team_barrier();
+
+        // Sum viscous fluxes into fluxes of conserved variables; including energy fluxes
+        par_for_inner(member, ib.s, ib.e, [&](const int i) {
+          Real nud = 0.5 * nu * (prim(IDN, k, j, i) + prim(IDN, k - 1, j, i));
+          cons.flux(X3DIR, IV1, k, j, i) -= nud * fvx(i);
+          cons.flux(X3DIR, IV2, k, j, i) -= nud * fvy(i);
+          cons.flux(X3DIR, IV3, k, j, i) -= nud * fvz(i);
+          cons.flux(X3DIR, IEN, k, j, i) -=
+              0.5 * nud *
+              ((prim(IV1, k - 1, j, i) + prim(IV1, k, j, i)) * fvx(i) +
+               (prim(IV2, k - 1, j, i) + prim(IV2, k, j, i)) * fvy(i) +
+               (prim(IV3, k - 1, j, i) + prim(IV3, k, j, i)) * fvz(i));
+        });
+      });
+}
+
+//---------------------------------------------------------------------------------------
+//! TODO(pgrete) Calculate momentum diffusion, general case, i.e., anisotropic and/or with
+//! varying coefficient
+
+void MomentumDiffFluxGeneral(MeshData<Real> *md) { PARTHENON_THROW("Needs impl."); }
diff --git a/src/hydro/hydro.cpp b/src/hydro/hydro.cpp
index 80e1c5a5..81ebc1c8 100644
--- a/src/hydro/hydro.cpp
+++ b/src/hydro/hydro.cpp
@@ -4,6 +4,7 @@
 // Licensed under the BSD 3-Clause License (the "LICENSE").
 //========================================================================================
 
+#include <algorithm>
 #include <limits>
 #include <memory>
 #include <string>
@@ -29,6 +30,7 @@
 #include "diffusion/diffusion.hpp"
 #include "glmmhd/glmmhd.hpp"
 #include "hydro.hpp"
+#include "interface/params.hpp"
 #include "outputs/outputs.hpp"
 #include "prolongation/custom_ops.hpp"
 #include "rsolvers/rsolvers.hpp"
@@ -55,32 +57,85 @@ parthenon::Packages_t ProcessPackages(std::unique_ptr<ParameterInput> &pin) {
   return packages;
 }
 
+// Calculate mininum dx, which is used in calculating the divergence cleaning speed c_h
+// TODO(PG) eventually move to calculating the timestep once the timestep calc
+// has been moved to be done before Step()
+Real CalculateGlobalMinDx(MeshData<Real> *md) {
+  auto *pmb = md->GetBlockData(0)->GetBlockPointer();
+  auto hydro_pkg = pmb->packages.Get("Hydro");
+
+  const auto &prim_pack = md->PackVariables(std::vector<std::string>{"prim"});
+
+  IndexRange ib = md->GetBlockData(0)->GetBoundsI(IndexDomain::interior);
+  IndexRange jb = md->GetBlockData(0)->GetBoundsJ(IndexDomain::interior);
+  IndexRange kb = md->GetBlockData(0)->GetBoundsK(IndexDomain::interior);
+
+  Real mindx = std::numeric_limits<Real>::max();
+
+  bool nx2 = prim_pack.GetDim(2) > 1;
+  bool nx3 = prim_pack.GetDim(3) > 1;
+  pmb->par_reduce(
+      "CalculateGlobalMinDx", 0, prim_pack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s,
+      ib.e,
+      KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &lmindx) {
+        const auto &coords = prim_pack.GetCoords(b);
+        lmindx = fmin(lmindx, coords.Dxc<1>(k, j, i));
+        if (nx2) {
+          lmindx = fmin(lmindx, coords.Dxc<2>(k, j, i));
+        }
+        if (nx3) {
+          lmindx = fmin(lmindx, coords.Dxc<3>(k, j, i));
+        }
+      },
+      Kokkos::Min<Real>(mindx));
+
+  return mindx;
+}
+
 // Using this per cycle function to populate various variables in
 // Params that require global reduction *and* need to be set/known when
 // the task list is constructed (versus when the task list is being executed).
 // TODO(next person touching this function): If more/separate feature are required
 // please separate concerns.
 void PreStepMeshUserWorkInLoop(Mesh *pmesh, ParameterInput *pin, SimTime &tm) {
-  auto hydro_pkg = pmesh->block_list[0]->packages.Get("Hydro");
-  const auto num_partitions = pmesh->DefaultNumPartitions();
-
-  if ((hydro_pkg->Param<DiffInt>("diffint") == DiffInt::rkl2) &&
-      (hydro_pkg->Param<Conduction>("conduction") != Conduction::none)) {
-    auto dt_diff = std::numeric_limits<Real>::max();
-    for (auto i = 0; i < num_partitions; i++) {
-      auto &md = pmesh->mesh_data.GetOrAdd("base", i);
-
-      dt_diff = std::min(dt_diff, EstimateConductionTimestep(md.get()));
+  auto hydro_pkg = pmesh->packages.Get("Hydro");
+
+  // Calculate hyperbolic divergence cleaning speed
+  // TODO(pgrete) Calculating mindx is only required after remeshing. Need to
+  // find a clean solution for this one-off global reduction.
+  if (hydro_pkg->Param<bool>("calc_c_h") ||
+      hydro_pkg->Param<DiffInt>("diffint") != DiffInt::none) {
+
+    Real mindx = std::numeric_limits<Real>::max();
+    // Going over default partitions. Not using a (new) single partition containing
+    // all blocks here as this (default) split is also used main Step() function and
+    // thus does not create an overhead (such as creating a new MeshBlockPack that is just
+    // used here). All partitions are executed sequentially. Given that a par_reduce to a
+    // host var is blocking it's save to dirctly use the return value.
+    const int num_partitions = pmesh->DefaultNumPartitions();
+    for (int i = 0; i < num_partitions; i++) {
+      auto &mu0 = pmesh->mesh_data.GetOrAdd("base", i);
+      mindx = std::min(mindx, CalculateGlobalMinDx(mu0.get()));
     }
 #ifdef MPI_PARALLEL
-    PARTHENON_MPI_CHECK(MPI_Allreduce(MPI_IN_PLACE, &dt_diff, 1, MPI_PARTHENON_REAL,
-                                      MPI_MIN, MPI_COMM_WORLD));
+    Real mins[3];
+    mins[0] = mindx;
+    mins[1] = hydro_pkg->Param<Real>("dt_hyp");
+    mins[2] = hydro_pkg->Param<Real>("dt_diff");
+    PARTHENON_MPI_CHECK(MPI_Allreduce(MPI_IN_PLACE, mins, 3, MPI_PARTHENON_REAL, MPI_MIN,
+                                      MPI_COMM_WORLD));
+
+    hydro_pkg->UpdateParam("mindx", mins[0]);
+    hydro_pkg->UpdateParam("dt_hyp", mins[1]);
+    hydro_pkg->UpdateParam("dt_diff", mins[2]);
+#else
+    hydro_pkg->UpdateParam("mindx", mindx);
+    // dt_hyp and dt_diff are already set directly in Params when they're calculated
 #endif
-    hydro_pkg->UpdateParam("dt_diff", dt_diff);
-    const auto max_dt_ratio = hydro_pkg->Param<Real>("rkl2_max_dt_ratio");
-    if (max_dt_ratio > 0.0 && tm.dt / dt_diff > max_dt_ratio) {
-      tm.dt = max_dt_ratio * dt_diff;
-    }
+    // Finally update c_h
+    const auto &cfl_hyp = hydro_pkg->Param<Real>("cfl");
+    const auto &dt_hyp = hydro_pkg->Param<Real>("dt_hyp");
+    hydro_pkg->UpdateParam("c_h", cfl_hyp * mindx / dt_hyp);
   }
 }
 
@@ -237,17 +292,23 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin) {
     auto glmmhd_alpha = pin->GetOrAddReal("hydro", "glmmhd_alpha", 0.1);
     pkg->AddParam<Real>("glmmhd_alpha", glmmhd_alpha);
     calc_c_h = true;
-    pkg->AddParam<Real>("c_h", 0.0, true); // hyperbolic divergence cleaning speed
-    // global minimum dx (used to calc c_h)
-    pkg->AddParam<Real>("mindx", std::numeric_limits<Real>::max(), true);
-    // hyperbolic timestep constraint
-    pkg->AddParam<Real>("dt_hyp", std::numeric_limits<Real>::max(), true);
   } else {
     PARTHENON_FAIL("AthenaPK hydro: Unknown fluid method.");
   }
   pkg->AddParam<>("fluid", fluid);
   pkg->AddParam<>("nhydro", nhydro);
   pkg->AddParam<>("calc_c_h", calc_c_h);
+  // Following params should (currently) be present independent of solver because
+  // they're all used in the main loop.
+  // TODO(pgrete) think about which approach (selective versus always is preferable)
+  pkg->AddParam<Real>(
+      "c_h", 0.0, Params::Mutability::Mutable); // hyperbolic divergence cleaning speed
+  // global minimum dx (used to calc c_h)
+  pkg->AddParam<Real>("mindx", std::numeric_limits<Real>::max(),
+                      Params::Mutability::Mutable);
+  // hyperbolic timestep constraint
+  pkg->AddParam<Real>("dt_hyp", std::numeric_limits<Real>::max(),
+                      Params::Mutability::Mutable);
 
   const auto recon_str = pin->GetString("hydro", "reconstruction");
   int recon_need_nghost = 3; // largest number for the choices below
@@ -544,6 +605,67 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin) {
     }
     pkg->AddParam<>("conduction", conduction);
 
+    auto viscosity = Viscosity::none;
+    auto viscosity_str = pin->GetOrAddString("diffusion", "viscosity", "none");
+    if (viscosity_str == "isotropic") {
+      viscosity = Viscosity::isotropic;
+    } else if (viscosity_str != "none") {
+      PARTHENON_FAIL("Unknown viscosity method. Options are: none, isotropic");
+    }
+    // If viscosity is enabled, process supported coefficients
+    if (viscosity != Viscosity::none) {
+      auto viscosity_coeff_str =
+          pin->GetOrAddString("diffusion", "viscosity_coeff", "none");
+      auto viscosity_coeff = ViscosityCoeff::none;
+
+      if (viscosity_coeff_str == "fixed") {
+        viscosity_coeff = ViscosityCoeff::fixed;
+        Real mom_diff_coeff_code = pin->GetReal("diffusion", "mom_diff_coeff_code");
+        auto mom_diff = MomentumDiffusivity(viscosity, viscosity_coeff,
+                                            mom_diff_coeff_code, 0.0, 0.0, 0.0);
+        pkg->AddParam<>("mom_diff", mom_diff);
+
+      } else {
+        PARTHENON_FAIL("Viscosity is enabled but no coefficient is set. Please "
+                       "set diffusion/viscosity_coeff to 'fixed' and "
+                       "diffusion/mom_diff_coeff_code to the desired value.");
+      }
+    }
+    pkg->AddParam<>("viscosity", viscosity);
+
+    auto resistivity = Resistivity::none;
+    auto resistivity_str = pin->GetOrAddString("diffusion", "resistivity", "none");
+    if (resistivity_str == "ohmic") {
+      resistivity = Resistivity::ohmic;
+    } else if (resistivity_str != "none") {
+      PARTHENON_FAIL("Unknown resistivity method. Options are: none, ohmic");
+    }
+    // If resistivity is enabled, process supported coefficients
+    if (resistivity != Resistivity::none) {
+      auto resistivity_coeff_str =
+          pin->GetOrAddString("diffusion", "resistivity_coeff", "none");
+      auto resistivity_coeff = ResistivityCoeff::none;
+
+      if (resistivity_coeff_str == "spitzer") {
+        // If this is implemented, check how the Spitzer coeff for thermal conduction is
+        // handled.
+        PARTHENON_FAIL("needs impl");
+
+      } else if (resistivity_coeff_str == "fixed") {
+        resistivity_coeff = ResistivityCoeff::fixed;
+        Real ohm_diff_coeff_code = pin->GetReal("diffusion", "ohm_diff_coeff_code");
+        auto ohm_diff = OhmicDiffusivity(resistivity, resistivity_coeff,
+                                         ohm_diff_coeff_code, 0.0, 0.0, 0.0);
+        pkg->AddParam<>("ohm_diff", ohm_diff);
+
+      } else {
+        PARTHENON_FAIL("Resistivity is enabled but no coefficient is set. Please "
+                       "set diffusion/resistivity_coeff to 'fixed' and "
+                       "diffusion/ohm_diff_coeff_code to the desired value.");
+      }
+    }
+    pkg->AddParam<>("resistivity", resistivity);
+
     auto diffint_str = pin->GetOrAddString("diffusion", "integrator", "none");
     auto diffint = DiffInt::none;
     if (diffint_str == "unsplit") {
@@ -557,8 +679,13 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin) {
                      "Options are: none, unsplit, rkl2");
     }
     if (diffint != DiffInt::none) {
-      pkg->AddParam<Real>("dt_diff", 0.0, true); // diffusive timestep constraint
+      // As in Athena++ a cfl safety factor is also applied to the theoretical limit.
+      // By default it is equal to the hyperbolic cfl.
+      auto cfl_diff = pin->GetOrAddReal("diffusion", "cfl", pkg->Param<Real>("cfl"));
+      pkg->AddParam<>("cfl_diff", cfl_diff);
     }
+    pkg->AddParam<Real>("dt_diff", std::numeric_limits<Real>::max(),
+                        Params::Mutability::Mutable); // diffusive timestep constraint
     pkg->AddParam<>("diffint", diffint);
 
     if (fluid == Fluid::euler) {
@@ -788,9 +915,12 @@ Real EstimateTimestep(MeshData<Real> *md) {
   // get to package via first block in Meshdata (which exists by construction)
   auto hydro_pkg = md->GetBlockData(0)->GetBlockPointer()->packages.Get("Hydro");
   auto min_dt = std::numeric_limits<Real>::max();
+  auto dt_hyp = std::numeric_limits<Real>::max();
 
-  if (hydro_pkg->Param<bool>("calc_dt_hyp")) {
-    min_dt = std::min(min_dt, EstimateHyperbolicTimestep<fluid>(md));
+  const auto calc_dt_hyp = hydro_pkg->Param<bool>("calc_dt_hyp");
+  if (calc_dt_hyp) {
+    dt_hyp = EstimateHyperbolicTimestep<fluid>(md);
+    min_dt = std::min(min_dt, dt_hyp);
   }
 
   const auto &enable_cooling = hydro_pkg->Param<Cooling>("enable_cooling");
@@ -802,10 +932,34 @@ Real EstimateTimestep(MeshData<Real> *md) {
     min_dt = std::min(min_dt, tabular_cooling.EstimateTimeStep(md));
   }
 
-  // For RKL2 STS, the diffusive timestep is calculated separately in the driver
-  if ((hydro_pkg->Param<DiffInt>("diffint") == DiffInt::unsplit) &&
-      (hydro_pkg->Param<Conduction>("conduction") != Conduction::none)) {
-    min_dt = std::min(min_dt, EstimateConductionTimestep(md));
+  auto dt_diff = std::numeric_limits<Real>::max();
+  if (hydro_pkg->Param<DiffInt>("diffint") != DiffInt::none) {
+    if (hydro_pkg->Param<Conduction>("conduction") != Conduction::none) {
+      dt_diff = std::min(dt_diff, EstimateConductionTimestep(md));
+    }
+    if (hydro_pkg->Param<Viscosity>("viscosity") != Viscosity::none) {
+      dt_diff = std::min(dt_diff, EstimateViscosityTimestep(md));
+    }
+    if (hydro_pkg->Param<Resistivity>("resistivity") != Resistivity::none) {
+      dt_diff = std::min(dt_diff, EstimateResistivityTimestep(md));
+    }
+
+    // For unsplit ingegration use strict limit
+    if (hydro_pkg->Param<DiffInt>("diffint") == DiffInt::unsplit) {
+      min_dt = std::min(min_dt, dt_diff);
+      // and for RKL2 integration use limit taking into account the maxium ratio
+      // or not constrain limit further (which is why RKL2 is there in first place)
+    } else if (hydro_pkg->Param<DiffInt>("diffint") == DiffInt::rkl2) {
+      const auto max_dt_ratio = hydro_pkg->Param<Real>("rkl2_max_dt_ratio");
+      if (max_dt_ratio > 0.0 && dt_hyp / dt_diff > max_dt_ratio) {
+        min_dt = std::min(min_dt, max_dt_ratio * dt_diff);
+      }
+    } else {
+      PARTHENON_THROW("Looks like a a new diffusion integrator was implemented without "
+                      "taking into accout timestep contstraints yet.");
+    }
+    auto dt_diff_param = hydro_pkg->Param<Real>("dt_diff");
+    hydro_pkg->UpdateParam("dt_diff", std::min(dt_diff, dt_diff_param));
   }
 
   if (ProblemEstimateTimestep != nullptr) {
@@ -1123,10 +1277,10 @@ TaskStatus FirstOrderFluxCorrect(MeshData<Real> *u0_data, MeshData<Real> *u1_dat
           const auto &u0_prim = u0_prim_pack(b);
           auto &u0_cons = u0_cons_pack(b);
 
-          // In principle, the u_cons.fluxes could be updated in parallel by a different
-          // thread resulting in a race conditon here.
-          // However, if the fluxes of a cell have been updated (anywhere) then the entire
-          // kernel will be called again anyway, and, at that point the already fixed
+          // In principle, the u_cons.fluxes could be updated in parallel by a
+          // different thread resulting in a race conditon here. However, if the
+          // fluxes of a cell have been updated (anywhere) then the entire kernel will
+          // be called again anyway, and, at that point the already fixed
           // u0_cons.fluxes will automaticlly be used here.
           Real new_cons[NVAR];
           for (auto v = 0; v < NVAR; v++) {
@@ -1154,13 +1308,13 @@ TaskStatus FirstOrderFluxCorrect(MeshData<Real> *u0_data, MeshData<Real> *u1_dat
             lnum_need_floor += 1;
             return;
           }
-          // In principle, there could be a racecondion as this loop goes over all k,j,i
-          // and we updating the i+1 flux here.
-          // However, the results are idential because u0_prim is never updated in this
-          // kernel so we don't worry about it.
-          // TODO(pgrete) as we need to keep the function signature idential for now (due
-          // to Cuda compiler bug) we could potentially template these function and get
-          // rid of the `if constexpr`
+          // In principle, there could be a racecondion as this loop goes over all
+          // k,j,i and we updating the i+1 flux here. However, the results are
+          // idential because u0_prim is never updated in this kernel so we don't
+          // worry about it.
+          // TODO(pgrete) as we need to keep the function signature idential for now
+          // (due to Cuda compiler bug) we could potentially template these function
+          // and get rid of the `if constexpr`
           riemann.Solve(eos, k, j, i, IV1, u0_prim, u0_cons, c_h);
           riemann.Solve(eos, k, j, i + 1, IV1, u0_prim, u0_cons, c_h);
 
@@ -1177,7 +1331,8 @@ TaskStatus FirstOrderFluxCorrect(MeshData<Real> *u0_data, MeshData<Real> *u1_dat
         Kokkos::Sum<std::int64_t>(num_corrected),
         Kokkos::Sum<std::int64_t>(num_need_floor));
     // TODO(pgrete) make this optional and global (potentially store values in Params)
-    // std::cout << "[" << parthenon::Globals::my_rank << "] Attempt: " << num_attempts
+    // std::cout << "[" << parthenon::Globals::my_rank << "] Attempt: " <<
+    // num_attempts
     //           << " Corrected (center): " << num_corrected
     //           << " Failed (will rely on floor): " << num_need_floor << std::endl;
     num_attempts += 1;
diff --git a/src/hydro/hydro_driver.cpp b/src/hydro/hydro_driver.cpp
index 2337495f..d4c06ee3 100644
--- a/src/hydro/hydro_driver.cpp
+++ b/src/hydro/hydro_driver.cpp
@@ -38,46 +38,6 @@ HydroDriver::HydroDriver(ParameterInput *pin, ApplicationInput *app_in, Mesh *pm
   pin->CheckDesired("parthenon/time", "cfl");
 }
 
-// Calculate mininum dx, which is used in calculating the divergence cleaning speed c_h
-TaskStatus CalculateGlobalMinDx(MeshData<Real> *md) {
-  auto pmb = md->GetBlockData(0)->GetBlockPointer();
-  auto hydro_pkg = pmb->packages.Get("Hydro");
-
-  const auto &prim_pack = md->PackVariables(std::vector<std::string>{"prim"});
-
-  IndexRange ib = md->GetBlockData(0)->GetBoundsI(IndexDomain::interior);
-  IndexRange jb = md->GetBlockData(0)->GetBoundsJ(IndexDomain::interior);
-  IndexRange kb = md->GetBlockData(0)->GetBoundsK(IndexDomain::interior);
-
-  Real mindx = std::numeric_limits<Real>::max();
-
-  bool nx2 = prim_pack.GetDim(2) > 1;
-  bool nx3 = prim_pack.GetDim(3) > 1;
-  pmb->par_reduce(
-      "CalculateGlobalMinDx", 0, prim_pack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s,
-      ib.e,
-      KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &lmindx) {
-        const auto &coords = prim_pack.GetCoords(b);
-        lmindx = fmin(lmindx, coords.Dxc<1>(k, j, i));
-        if (nx2) {
-          lmindx = fmin(lmindx, coords.Dxc<2>(k, j, i));
-        }
-        if (nx3) {
-          lmindx = fmin(lmindx, coords.Dxc<3>(k, j, i));
-        }
-      },
-      Kokkos::Min<Real>(mindx));
-
-  // Reduction to host var is blocking and only have one of this tasks run at the same
-  // time so modifying the package should be safe.
-  auto mindx_pkg = hydro_pkg->Param<Real>("mindx");
-  if (mindx < mindx_pkg) {
-    hydro_pkg->UpdateParam("mindx", mindx);
-  }
-
-  return TaskStatus::complete;
-}
-
 // Sets all fluxes to 0
 TaskStatus ResetFluxes(MeshData<Real> *md) {
   auto pmb = md->GetBlockData(0)->GetBlockPointer();
@@ -465,55 +425,6 @@ TaskCollection HydroDriver::MakeTaskCollection(BlockList_t &blocks, int stage) {
     }
   }
 
-  // Calculate hyperbolic divergence cleaning speed
-  // TODO(pgrete) Calculating mindx is only required after remeshing. Need to find a clean
-  // solution for this one-off global reduction.
-  if (hydro_pkg->Param<bool>("calc_c_h") && (stage == 1)) {
-    // need to make sure that there's only one region in order to MPI_reduce to work
-    TaskRegion &single_task_region = tc.AddRegion(1);
-    auto &tl = single_task_region[0];
-    // Adding one task for each partition. Not using a (new) single partition containing
-    // all blocks here as this (default) split is also used for the following tasks and
-    // thus does not create an overhead (such as creating a new MeshBlockPack that is just
-    // used here). Given that all partitions are in one task list they'll be executed
-    // sequentially. Given that a par_reduce to a host var is blocking it's also save to
-    // store the variable in the Params for now.
-    auto prev_task = none;
-    for (int i = 0; i < num_partitions; i++) {
-      auto &mu0 = pmesh->mesh_data.GetOrAdd("base", i);
-      auto new_mindx = tl.AddTask(prev_task, CalculateGlobalMinDx, mu0.get());
-      prev_task = new_mindx;
-    }
-    auto reduce_c_h = prev_task;
-#ifdef MPI_PARALLEL
-    reduce_c_h = tl.AddTask(
-        prev_task,
-        [](StateDescriptor *hydro_pkg) {
-          Real mins[2];
-          mins[0] = hydro_pkg->Param<Real>("mindx");
-          mins[1] = hydro_pkg->Param<Real>("dt_hyp");
-          PARTHENON_MPI_CHECK(MPI_Allreduce(MPI_IN_PLACE, mins, 2, MPI_PARTHENON_REAL,
-                                            MPI_MIN, MPI_COMM_WORLD));
-
-          hydro_pkg->UpdateParam("mindx", mins[0]);
-          hydro_pkg->UpdateParam("dt_hyp", mins[1]);
-          return TaskStatus::complete;
-        },
-        hydro_pkg.get());
-#endif
-    // Finally update c_h
-    auto update_c_h = tl.AddTask(
-        reduce_c_h,
-        [](StateDescriptor *hydro_pkg) {
-          const auto &mindx = hydro_pkg->Param<Real>("mindx");
-          const auto &cfl_hyp = hydro_pkg->Param<Real>("cfl");
-          const auto &dt_hyp = hydro_pkg->Param<Real>("dt_hyp");
-          hydro_pkg->UpdateParam("c_h", cfl_hyp * mindx / dt_hyp);
-          return TaskStatus::complete;
-        },
-        hydro_pkg.get());
-  }
-
   // calculate magnetic tower scaling
   if ((stage == 1) && hydro_pkg->AllParams().hasKey("magnetic_tower_power_scaling") &&
       hydro_pkg->Param<bool>("magnetic_tower_power_scaling")) {
@@ -676,21 +587,6 @@ TaskCollection HydroDriver::MakeTaskCollection(BlockList_t &blocks, int stage) {
                                         pmesh->multilevel);
   }
 
-  // Single task in single (serial) region to reset global vars used in reductions in the
-  // first stage.
-  if (stage == integrator->nstages && hydro_pkg->Param<bool>("calc_c_h")) {
-    TaskRegion &reset_reduction_vars_region = tc.AddRegion(1);
-    auto &tl = reset_reduction_vars_region[0];
-    tl.AddTask(
-        none,
-        [](StateDescriptor *hydro_pkg) {
-          hydro_pkg->UpdateParam("mindx", std::numeric_limits<Real>::max());
-          hydro_pkg->UpdateParam("dt_hyp", std::numeric_limits<Real>::max());
-          return TaskStatus::complete;
-        },
-        hydro_pkg.get());
-  }
-
   TaskRegion &single_tasklist_per_pack_region_3 = tc.AddRegion(num_partitions);
   for (int i = 0; i < num_partitions; i++) {
     auto &tl = single_tasklist_per_pack_region_3[i];
@@ -705,6 +601,26 @@ TaskCollection HydroDriver::MakeTaskCollection(BlockList_t &blocks, int stage) {
     AddSTSTasks(&tc, pmesh, blocks, 0.5 * tm.dt);
   }
 
+  // Single task in single (serial) region to reset global vars used in reductions in the
+  // first stage.
+  // TODO(pgrete) check if we logically need this reset or if we can reset within the
+  // timestep task
+  if (stage == integrator->nstages &&
+      (hydro_pkg->Param<bool>("calc_c_h") ||
+       hydro_pkg->Param<DiffInt>("diffint") != DiffInt::none)) {
+    TaskRegion &reset_reduction_vars_region = tc.AddRegion(1);
+    auto &tl = reset_reduction_vars_region[0];
+    tl.AddTask(
+        none,
+        [](StateDescriptor *hydro_pkg) {
+          hydro_pkg->UpdateParam("mindx", std::numeric_limits<Real>::max());
+          hydro_pkg->UpdateParam("dt_hyp", std::numeric_limits<Real>::max());
+          hydro_pkg->UpdateParam("dt_diff", std::numeric_limits<Real>::max());
+          return TaskStatus::complete;
+        },
+        hydro_pkg.get());
+  }
+
   if (stage == integrator->nstages) {
     TaskRegion &tr = tc.AddRegion(num_partitions);
     for (int i = 0; i < num_partitions; i++) {
diff --git a/src/main.cpp b/src/main.cpp
index 93f984bb..23cdb825 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -55,6 +55,7 @@ int main(int argc, char *argv[]) {
     pman.app_input->InitUserMeshData = linear_wave_mhd::InitUserMeshData;
     pman.app_input->ProblemGenerator = linear_wave_mhd::ProblemGenerator;
     pman.app_input->UserWorkAfterLoop = linear_wave_mhd::UserWorkAfterLoop;
+    Hydro::ProblemInitPackageData = linear_wave_mhd::ProblemInitPackageData;
   } else if (problem == "cpaw") {
     pman.app_input->InitUserMeshData = cpaw::InitUserMeshData;
     pman.app_input->ProblemGenerator = cpaw::ProblemGenerator;
@@ -62,8 +63,8 @@ int main(int argc, char *argv[]) {
   } else if (problem == "cloud") {
     pman.app_input->InitUserMeshData = cloud::InitUserMeshData;
     pman.app_input->ProblemGenerator = cloud::ProblemGenerator;
-    pman.app_input->boundary_conditions[parthenon::BoundaryFace::inner_x2] =
-        cloud::InflowWindX2;
+    pman.app_input->RegisterBoundaryCondition(parthenon::BoundaryFace::inner_x2,
+                                              "cloud_inflow_x2", cloud::InflowWindX2);
     Hydro::ProblemCheckRefinementBlock = cloud::ProblemCheckRefinementBlock;
   } else if (problem == "blast") {
     pman.app_input->InitUserMeshData = blast::InitUserMeshData;
@@ -80,7 +81,7 @@ int main(int argc, char *argv[]) {
     pman.app_input->ProblemGenerator = field_loop::ProblemGenerator;
     Hydro::ProblemInitPackageData = field_loop::ProblemInitPackageData;
   } else if (problem == "kh") {
-    pman.app_input->ProblemGenerator = kh::ProblemGenerator;
+    pman.app_input->MeshProblemGenerator = kh::ProblemGenerator;
   } else if (problem == "rand_blast") {
     pman.app_input->ProblemGenerator = rand_blast::ProblemGenerator;
     Hydro::ProblemInitPackageData = rand_blast::ProblemInitPackageData;
diff --git a/src/main.hpp b/src/main.hpp
index 033118ad..4c81c307 100644
--- a/src/main.hpp
+++ b/src/main.hpp
@@ -1,5 +1,5 @@
 // AthenaPK - a performance portable block structured AMR MHD code
-// Copyright (c) 2020-2021, Athena Parthenon Collaboration. All rights reserved.
+// Copyright (c) 2020-2024, Athena Parthenon Collaboration. All rights reserved.
 // Licensed under the 3-Clause License (the "LICENSE")
 
 #ifndef MAIN_HPP_
@@ -37,6 +37,10 @@ enum class Fluid { undefined, euler, glmmhd };
 enum class Cooling { none, tabular };
 enum class Conduction { none, isotropic, anisotropic };
 enum class ConductionCoeff { none, fixed, spitzer };
+enum class Viscosity { none, isotropic };
+enum class ViscosityCoeff { none, fixed };
+enum class Resistivity { none, ohmic };
+enum class ResistivityCoeff { none, fixed, spitzer };
 enum class DiffInt { none, unsplit, rkl2 };
 
 enum class Hst { idx, ekin, emag, divb };
diff --git a/src/pgen/diffusion.cpp b/src/pgen/diffusion.cpp
index 24ae60f2..269926f8 100644
--- a/src/pgen/diffusion.cpp
+++ b/src/pgen/diffusion.cpp
@@ -1,6 +1,6 @@
 
 // AthenaPK - a performance portable block structured AMR MHD code
-// Copyright (c) 2021-2023, Athena Parthenon Collaboration. All rights reserved.
+// Copyright (c) 2021-2024, Athena Parthenon Collaboration. All rights reserved.
 // Licensed under the 3-Clause License (the "LICENSE")
 
 // Parthenon headers
@@ -11,11 +11,13 @@
 
 // AthenaPK headers
 #include "../main.hpp"
+#include "utils/error_checking.hpp"
 
 namespace diffusion {
 using namespace parthenon::driver::prelude;
 
 void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
+  auto hydro_pkg = pmb->packages.Get("Hydro");
   IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior);
   IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior);
   IndexRange kb = pmb->cellbounds.GetBoundsK(IndexDomain::interior);
@@ -23,24 +25,39 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
   auto &mbd = pmb->meshblock_data.Get();
   auto &u = mbd->Get("cons").data;
 
+  const auto gamma = pin->GetReal("hydro", "gamma");
+  const bool mhd_enabled = hydro_pkg->Param<Fluid>("fluid") == Fluid::glmmhd;
+
   const auto Bx = pin->GetOrAddReal("problem/diffusion", "Bx", 0.0);
   const auto By = pin->GetOrAddReal("problem/diffusion", "By", 0.0);
 
   const auto iprob = pin->GetInteger("problem/diffusion", "iprob");
+  PARTHENON_REQUIRE_THROWS(mhd_enabled || !(iprob == 0 || iprob == 1 || iprob == 2 ||
+                                            iprob == 10 || iprob == 20 || iprob == 40),
+                           "Selected iprob for diffusion pgen requires MHD enabled.")
   Real t0 = 0.5;
   Real diff_coeff = 0.0;
   Real amp = 1e-6;
-  // Get parameters for Gaussian profile
-  if (iprob == 10) {
-    diff_coeff = pin->GetReal("diffusion", "thermal_diff_coeff_code");
+  // Get common parameters for Gaussian profile
+  if ((iprob == 10) || (iprob == 30) || (iprob == 40)) {
     t0 = pin->GetOrAddReal("problem/diffusion", "t0", t0);
     amp = pin->GetOrAddReal("problem/diffusion", "amp", amp);
   }
+  // Heat diffusion of 1D Gaussian
+  if (iprob == 10) {
+    diff_coeff = pin->GetReal("diffusion", "thermal_diff_coeff_code");
+    // Viscous diffusion of 1D Gaussian
+  } else if (iprob == 30) {
+    diff_coeff = pin->GetReal("diffusion", "mom_diff_coeff_code");
+    // Ohmic diffusion of 1D Gaussian
+  } else if (iprob == 40) {
+    diff_coeff = pin->GetReal("diffusion", "ohm_diff_coeff_code");
+  }
 
   auto &coords = pmb->coords;
 
   pmb->par_for(
-      "ProblemGenerator: Diffusion Step", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      "ProblemGenerator: Diffusion", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int k, const int j, const int i) {
         u(IDN, k, j, i) = 1.0;
 
@@ -48,11 +65,13 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
         u(IM2, k, j, i) = 0.0;
         u(IM3, k, j, i) = 0.0;
 
-        u(IB1, k, j, i) = 0.0;
-        u(IB2, k, j, i) = 0.0;
-        u(IB3, k, j, i) = 0.0;
+        if (mhd_enabled) {
+          u(IB1, k, j, i) = 0.0;
+          u(IB2, k, j, i) = 0.0;
+          u(IB3, k, j, i) = 0.0;
+        }
 
-        Real eint;
+        Real eint = -1.0;
         // step function x1
         if (iprob == 0) {
           u(IB1, k, j, i) = Bx;
@@ -111,12 +130,31 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
           u(IB3, k, j, i) = y / r;
           u(IB1, k, j, i) = -x / r;
           eint = std::abs(r - 0.6) < 0.1 && std::abs(phi) < M_PI / 12.0 ? 12.0 : 10.0;
+          // Viscous diffusion of 1D Gaussian
+        } else if (iprob == 30) {
+          u(IM2, k, j, i) =
+              u(IDN, k, j, i) * amp /
+              std::pow(std::sqrt(4. * M_PI * diff_coeff * t0), 1.0) *
+              std::exp(-(std::pow(coords.Xc<1>(i), 2.)) / (4. * diff_coeff * t0));
+          eint = 1.0 / (gamma * (gamma - 1.0)); // c_s = 1 everywhere
+          // Ohmic diffusion of 1D Gaussian
+        } else if (iprob == 40) {
+          u(IB2, k, j, i) =
+              amp / std::pow(std::sqrt(4. * M_PI * diff_coeff * t0), 1.0) *
+              std::exp(-(std::pow(coords.Xc<1>(i), 2.)) / (4. * diff_coeff * t0));
+          eint = 1.0 / (gamma * (gamma - 1.0)); // c_s = 1 everywhere
         }
+
+        PARTHENON_REQUIRE(eint > 0.0, "Missing init of eint");
         u(IEN, k, j, i) =
             u(IDN, k, j, i) * eint +
-            0.5 * (SQR(u(IB1, k, j, i)) + SQR(u(IB2, k, j, i)) + SQR(u(IB3, k, j, i)) +
-                   (SQR(u(IM1, k, j, i)) + SQR(u(IM2, k, j, i)) + SQR(u(IM3, k, j, i))) /
-                       u(IDN, k, j, i));
+            0.5 * ((SQR(u(IM1, k, j, i)) + SQR(u(IM2, k, j, i)) + SQR(u(IM3, k, j, i))) /
+                   u(IDN, k, j, i));
+
+        if (mhd_enabled) {
+          u(IEN, k, j, i) +=
+              0.5 * (SQR(u(IB1, k, j, i)) + SQR(u(IB2, k, j, i)) + SQR(u(IB3, k, j, i)));
+        }
       });
 }
 } // namespace diffusion
diff --git a/src/pgen/kh.cpp b/src/pgen/kh.cpp
index 547f2a2e..68936935 100644
--- a/src/pgen/kh.cpp
+++ b/src/pgen/kh.cpp
@@ -32,61 +32,32 @@
 
 // AthenaPK headers
 #include "../main.hpp"
+#include "utils/error_checking.hpp"
 
 namespace kh {
 using namespace parthenon::driver::prelude;
 
 //----------------------------------------------------------------------------------------
-//! \fn void MeshBlock::ProblemGenerator(ParameterInput *pin)
+//! \fn void Mesh::ProblemGenerator(Mesh *pm, ParameterInput *pin, MeshData<Real> *md)
 //  \brief Problem Generator for the Kelvin-Helmholtz test
 
-void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
+void ProblemGenerator(Mesh *pmesh, ParameterInput *pin, MeshData<Real> *md) {
   auto vflow = pin->GetReal("problem/kh", "vflow");
   auto iprob = pin->GetInteger("problem/kh", "iprob");
+  // Get pointer to first block (always exists) for common data like loop bounds
+  auto pmb = md->GetBlockData(0)->GetBlockPointer();
   auto ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior);
   auto jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior);
   auto kb = pmb->cellbounds.GetBoundsK(IndexDomain::interior);
   auto gam = pin->GetReal("hydro", "gamma");
   auto gm1 = (gam - 1.0);
 
-  // initialize conserved variables
-  auto &rc = pmb->meshblock_data.Get();
-  auto &u_dev = rc->Get("cons").data;
-  auto &coords = pmb->coords;
-  // initializing on host
-  auto u = u_dev.GetHostMirrorAndCopy();
+  // Initialize conserved variables
+  // Get a MeshBlockPack on device with all conserved variables
+  const auto &cons = md->PackVariables(std::vector<std::string>{"cons"});
+  const auto num_blocks = md->NumBlocks();
 
-  std::mt19937 gen(pmb->gid); // Standard mersenne_twister_engine seeded with gid
-  std::uniform_real_distribution<Real> ran(-0.5, 0.5);
-
-  //--- iprob=1.  Uniform stream with density ratio "drat" located in region -1/4<y<1/4
-  // moving at (-vflow) seperated by two slip-surfaces from background medium with d=1
-  // moving at (+vflow), random perturbations.  This is the classic, unresolved K-H test.
-
-  if (iprob == 1) {
-    // Read problem parameters
-    Real drat = pin->GetReal("problem/kh", "drat");
-    Real amp = pin->GetReal("problem/kh", "amp");
-    for (int k = kb.s; k <= kb.e; k++) {
-      for (int j = jb.s; j <= jb.e; j++) {
-        for (int i = ib.s; i <= ib.e; i++) {
-          u(IDN, k, j, i) = 1.0;
-          u(IM1, k, j, i) = vflow + amp * ran(gen);
-          u(IM2, k, j, i) = amp * ran(gen);
-          u(IM3, k, j, i) = 0.0;
-          if (std::abs(coords.Xc<2>(j)) < 0.25) {
-            u(IDN, k, j, i) = drat;
-            u(IM1, k, j, i) = -drat * (vflow + amp * ran(gen));
-            u(IM2, k, j, i) = drat * amp * ran(gen);
-          }
-          // Pressure scaled to give a sound speed of 1 with gamma=1.4
-          u(IEN, k, j, i) =
-              2.5 / gm1 +
-              0.5 * (SQR(u(IM1, k, j, i)) + SQR(u(IM2, k, j, i))) / u(IDN, k, j, i);
-        }
-      }
-    }
-  }
+  //--- iprob=1. This was the classic, unresolved K-H test.
 
   //--- iprob=2. Uniform density medium moving at +/-vflow seperated by a single shear
   // layer with tanh() profile at y=0 with a single mode perturbation, reflecting BCs at
@@ -97,9 +68,11 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
     Real amp = pin->GetReal("problem/kh", "amp");
     Real a = 0.02;
     Real sigma = 0.2;
-    for (int k = kb.s; k <= kb.e; k++) {
-      for (int j = jb.s; j <= jb.e; j++) {
-        for (int i = ib.s; i <= ib.e; i++) {
+    pmb->par_for(
+        "KHI: iprob2", 0, num_blocks - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) {
+          const auto &u = cons(b);
+          const auto &coords = cons.GetCoords(b);
           u(IDN, k, j, i) = 1.0;
           u(IM1, k, j, i) = vflow * std::tanh((coords.Xc<2>(j)) / a);
           u(IM2, k, j, i) = amp * std::cos(2.0 * M_PI * coords.Xc<1>(i)) *
@@ -108,23 +81,22 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
           u(IEN, k, j, i) =
               1.0 / gm1 +
               0.5 * (SQR(u(IM1, k, j, i)) + SQR(u(IM2, k, j, i))) / u(IDN, k, j, i);
-        }
-      }
-    }
-  }
+        });
 
-  //--- iprob=3.  Test in SR paper (Beckwith & Stone, ApJS 193, 6, 2011).  Gives two
-  // resolved shear layers with tanh() profiles for velocity and density located at
-  // y = +/- 0.5, density one in middle and 0.01 elsewhere, single mode perturbation.
+  } else if (iprob == 3) {
+    //--- iprob=3.  Test in SR paper (Beckwith & Stone, ApJS 193, 6, 2011).  Gives two
+    // resolved shear layers with tanh() profiles for velocity and density located at
+    // y = +/- 0.5, density one in middle and 0.01 elsewhere, single mode perturbation.
 
-  if (iprob == 3) {
     // Read/set problem parameters
     Real amp = pin->GetReal("problem/kh", "amp");
     Real a = 0.01;
     Real sigma = 0.1;
-    for (int k = kb.s; k <= kb.e; k++) {
-      for (int j = jb.s; j <= jb.e; j++) {
-        for (int i = ib.s; i <= ib.e; i++) {
+    pmb->par_for(
+        "KHI: iprob3", 0, num_blocks - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) {
+          const auto &u = cons(b);
+          const auto &coords = cons.GetCoords(b);
           u(IDN, k, j, i) =
               0.505 + 0.495 * std::tanh((std::abs(coords.Xc<2>(j)) - 0.5) / a);
           u(IM1, k, j, i) = vflow * std::tanh((std::abs(coords.Xc<2>(j)) - 0.5) / a);
@@ -139,17 +111,15 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
           u(IEN, k, j, i) =
               1.0 / gm1 +
               0.5 * (SQR(u(IM1, k, j, i)) + SQR(u(IM2, k, j, i))) / u(IDN, k, j, i);
-        }
-      }
-    }
-  }
+        });
 
-  //--- iprob=4.  "Lecoanet" test, resolved shear layers with tanh() profiles for velocity
-  // and density located at z1=0.5, z2=1.5 two-mode perturbation for fully periodic BCs
+  } else if (iprob == 4) {
+    //--- iprob=4.  "Lecoanet" test, resolved shear layers with tanh() profiles for
+    // velocity
+    // and density located at z1=0.5, z2=1.5 two-mode perturbation for fully periodic BCs
 
-  // To promote symmetry of FP errors about midplanes, rescale z' = z - 1. ; x' = x - 0.5
-  // so that domain x1 = [-0.5, 0.5] and x2 = [-1.0, 1.0] is centered about origin
-  if (iprob == 4) {
+    // To promote symmetry of FP errors about midplanes, rescale z' = z - 1. ; x' = x -
+    // 0.5 so that domain x1 = [-0.5, 0.5] and x2 = [-1.0, 1.0] is centered about origin
     // Read/set problem parameters
     Real amp = pin->GetReal("problem/kh", "amp");
     // unstratified problem is the default
@@ -164,9 +134,11 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
     Real z1 = -0.5; // z1' = z1 - 1.0
     Real z2 = 0.5;  // z2' = z2 - 1.0
 
-    for (int k = kb.s; k <= kb.e; k++) {
-      for (int j = jb.s; j <= jb.e; j++) {
-        for (int i = ib.s; i <= ib.e; i++) {
+    pmb->par_for(
+        "KHI: iprob4", 0, num_blocks - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) {
+          const auto &u = cons(b);
+          const auto &coords = cons.GetCoords(b);
           // Lecoanet (2015) equation 8a)
           Real dens = 1.0 + 0.5 * drho_rho0 *
                                 (std::tanh((coords.Xc<2>(j) - z1) / a) -
@@ -233,40 +205,39 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
               P0 / gm1 +
               0.5 * (SQR(u(IM1, k, j, i)) + SQR(u(IM2, k, j, i)) + SQR(u(IM3, k, j, i))) /
                   u(IDN, k, j, i);
-        }
-      }
-    }
-    // copy initialized vars to device
-    u_dev.DeepCopy(u);
-  }
+        });
 
-  //--- iprob=5. Uniform stream with density ratio "drat" located in region -1/4<y<1/4
-  // moving at (-vflow) seperated by two resolved slip-surfaces from background medium
-  // with d=1 moving at (+vflow), with m=2 perturbation, for the AMR test.
+  } else if (iprob == 5) {
+    //--- iprob=5. Uniform stream with density ratio "drat" located in region -1/4<y<1/4
+    // moving at (-vflow) seperated by two resolved slip-surfaces from background medium
+    // with d=1 moving at (+vflow), with m=2 perturbation, for the AMR test.
+    // Note that the code below should match what's in the Athena++ method paper (but it
+    // does not match anymore what's implemented in Athena++ itself).
 
-  if (iprob == 5) {
     // Read problem parameters
     Real a = pin->GetReal("problem/kh", "a");
     Real sigma = pin->GetReal("problem/kh", "sigma");
     Real drat = pin->GetReal("problem/kh", "drat");
     Real amp = pin->GetReal("problem/kh", "amp");
-    for (int k = kb.s; k <= kb.e; k++) {
-      for (int j = jb.s; j <= jb.e; j++) {
-        for (int i = ib.s; i <= ib.e; i++) {
+    pmb->par_for(
+        "KHI: iprob5", 0, num_blocks - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) {
+          const auto &u = cons(b);
+          const auto &coords = cons.GetCoords(b);
           Real w = (std::tanh((std::abs(coords.Xc<2>(j)) - 0.25) / a) + 1.0) * 0.5;
           u(IDN, k, j, i) = w + (1.0 - w) * drat;
-          u(IM1, k, j, i) = w * vflow - (1.0 - w) * vflow * drat;
+          u(IM1, k, j, i) = u(IDN, k, j, i) * vflow * (w - 0.5);
           u(IM2, k, j, i) =
-              u(IDN, k, j, i) * amp * std::sin(2.0 * 2.0 * M_PI * coords.Xc<1>(i)) *
+              u(IDN, k, j, i) * amp * std::cos(2.0 * 2.0 * M_PI * coords.Xc<1>(i)) *
               std::exp(-SQR(std::abs(coords.Xc<2>(j)) - 0.25) / (sigma * sigma));
           u(IM3, k, j, i) = 0.0;
           // Pressure scaled to give a sound speed of 1 with gamma=1.4
           u(IEN, k, j, i) =
               2.5 / gm1 +
-              0.25 * (SQR(u(IM1, k, j, i)) + SQR(u(IM2, k, j, i))) / u(IDN, k, j, i);
-        }
-      }
-    }
+              0.5 * (SQR(u(IM1, k, j, i)) + SQR(u(IM2, k, j, i))) / u(IDN, k, j, i);
+        });
+  } else {
+    PARTHENON_FAIL("Unknow iprob for KHI pgen.")
   }
 }
 
diff --git a/src/pgen/linear_wave_mhd.cpp b/src/pgen/linear_wave_mhd.cpp
index 67affbe7..6e2c105c 100644
--- a/src/pgen/linear_wave_mhd.cpp
+++ b/src/pgen/linear_wave_mhd.cpp
@@ -709,4 +709,38 @@ void Eigensystem(const Real d, const Real v1, const Real v2, const Real v3, cons
   left_eigenmatrix[6][6] = left_eigenmatrix[0][6];
 }
 
+// For decaying wave with diffusive processes test problem, dump max V_2
+Real HstMaxV2(MeshData<Real> *md) {
+  auto hydro_pkg = md->GetBlockData(0)->GetBlockPointer()->packages.Get("Hydro");
+
+  const auto &prim_pack = md->PackVariables(std::vector<std::string>{"prim"});
+
+  IndexRange ib = md->GetBlockData(0)->GetBoundsI(IndexDomain::interior);
+  IndexRange jb = md->GetBlockData(0)->GetBoundsJ(IndexDomain::interior);
+  IndexRange kb = md->GetBlockData(0)->GetBoundsK(IndexDomain::interior);
+
+  Real max_v2 = 0.0;
+
+  Kokkos::parallel_reduce(
+      "HstMaxV2",
+      Kokkos::MDRangePolicy<Kokkos::Rank<4>>(
+          parthenon::DevExecSpace(), {0, kb.s, jb.s, ib.s},
+          {prim_pack.GetDim(5), kb.e + 1, jb.e + 1, ib.e + 1},
+          {1, 1, 1, ib.e + 1 - ib.s}),
+      KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &lmax) {
+        lmax = Kokkos::fmax(lmax, Kokkos::fabs(prim_pack(b, IV2, k, j, i)));
+      },
+      Kokkos::Max<Real>(max_v2));
+
+  return max_v2;
+}
+
+void ProblemInitPackageData(ParameterInput *pin, parthenon::StateDescriptor *pkg) {
+  if (pin->GetOrAddBoolean("problem/linear_wave", "dump_max_v2", false)) {
+    auto hst_vars = pkg->Param<parthenon::HstVar_list>(parthenon::hist_param_key);
+    hst_vars.emplace_back(parthenon::HistoryOutputVar(
+        parthenon::UserHistoryOperation::max, HstMaxV2, "MaxAbsV2"));
+    pkg->UpdateParam(parthenon::hist_param_key, hst_vars);
+  }
+}
 } // namespace linear_wave_mhd
diff --git a/src/pgen/pgen.hpp b/src/pgen/pgen.hpp
index b3e55aad..877a1f13 100644
--- a/src/pgen/pgen.hpp
+++ b/src/pgen/pgen.hpp
@@ -25,6 +25,7 @@ void InitUserMeshData(Mesh *mesh, ParameterInput *pin);
 void ProblemGenerator(MeshBlock *pmb, parthenon::ParameterInput *pin);
 void UserWorkAfterLoop(Mesh *mesh, parthenon::ParameterInput *pin,
                        parthenon::SimTime &tm);
+void ProblemInitPackageData(ParameterInput *pin, parthenon::StateDescriptor *pkg);
 } // namespace linear_wave_mhd
 
 namespace cpaw {
@@ -83,7 +84,7 @@ void ProblemInitPackageData(ParameterInput *pin, parthenon::StateDescriptor *pkg
 namespace kh {
 using namespace parthenon::driver::prelude;
 
-void ProblemGenerator(MeshBlock *pmb, parthenon::ParameterInput *pin);
+void ProblemGenerator(Mesh *pm, parthenon::ParameterInput *pin, MeshData<Real> *md);
 } // namespace kh
 namespace rand_blast {
 using namespace parthenon::driver::prelude;
diff --git a/src/units.hpp b/src/units.hpp
index 3290e87e..fc6a2412 100644
--- a/src/units.hpp
+++ b/src/units.hpp
@@ -90,7 +90,8 @@ class Units {
     return code_energy_cgs() / kev_cgs * code_length_cgs() * code_length_cgs();
   }
   parthenon::Real code_magnetic_cgs() const {
-    return sqrt(code_mass_cgs()) / sqrt(code_length_cgs()) / code_time_cgs();
+    return std::sqrt(4.0 * M_PI) * sqrt(code_mass_cgs()) / sqrt(code_length_cgs()) /
+           code_time_cgs();
   }
 
   // Physical Constants in code units
diff --git a/tst/regression/CMakeLists.txt b/tst/regression/CMakeLists.txt
index c43b4de5..f4fa246e 100644
--- a/tst/regression/CMakeLists.txt
+++ b/tst/regression/CMakeLists.txt
@@ -49,6 +49,12 @@ setup_test_both("aniso_therm_cond_ring_multid" "--driver ${PROJECT_BINARY_DIR}/b
 setup_test_both("aniso_therm_cond_gauss_conv" "--driver ${PROJECT_BINARY_DIR}/bin/athenaPK \
   --driver_input ${PROJECT_SOURCE_DIR}/inputs/diffusion.in --num_steps 24" "convergence")
 
+setup_test_both("diffusion" "--driver ${PROJECT_BINARY_DIR}/bin/athenaPK \
+  --driver_input ${PROJECT_SOURCE_DIR}/inputs/diffusion.in --num_steps 12" "convergence")
+
+  setup_test_both("diffusion_linwave3d" "--driver ${PROJECT_BINARY_DIR}/bin/athenaPK \
+  --driver_input ${PROJECT_SOURCE_DIR}/inputs/linear_wave3d.in --num_steps 2" "convergence")
+
 setup_test_both("field_loop" "--driver ${PROJECT_BINARY_DIR}/bin/athenaPK \
   --driver_input ${PROJECT_SOURCE_DIR}/inputs/field_loop.in --num_steps 12" "convergence")
 
diff --git a/tst/regression/test_suites/aniso_therm_cond_ring_conv/aniso_therm_cond_ring_conv.py b/tst/regression/test_suites/aniso_therm_cond_ring_conv/aniso_therm_cond_ring_conv.py
index ba13129e..41a2dcc6 100644
--- a/tst/regression/test_suites/aniso_therm_cond_ring_conv/aniso_therm_cond_ring_conv.py
+++ b/tst/regression/test_suites/aniso_therm_cond_ring_conv/aniso_therm_cond_ring_conv.py
@@ -53,6 +53,10 @@ def Prepare(self, parameters, step):
             "parthenon/meshblock/nx3=1",
             "problem/diffusion/iprob=20",
             "parthenon/time/tlim=200.0",
+            # Work around for RKL2 integrator (that, by default, does not limit the
+            # timestep, which in newer versions of Parthenon results in triggering
+            # a fail-safe given the default init value of numeric_limits max.
+            "parthenon/time/dt_ceil=200.0",
             "parthenon/output0/dt=200.0",
             f"parthenon/output0/id={res}",
         ]
diff --git a/tst/regression/test_suites/aniso_therm_cond_ring_multid/aniso_therm_cond_ring_multid.py b/tst/regression/test_suites/aniso_therm_cond_ring_multid/aniso_therm_cond_ring_multid.py
index 3f933507..74083a63 100644
--- a/tst/regression/test_suites/aniso_therm_cond_ring_multid/aniso_therm_cond_ring_multid.py
+++ b/tst/regression/test_suites/aniso_therm_cond_ring_multid/aniso_therm_cond_ring_multid.py
@@ -87,6 +87,7 @@ def Prepare(self, parameters, step):
             "parthenon/time/tlim=200.0",
             "parthenon/output0/dt=200.0",
             f"parthenon/output0/id={step}",
+            "diffusion/integrator=unsplit",
         ]
 
         return parameters
diff --git a/tst/regression/test_suites/diffusion/__init__.py b/tst/regression/test_suites/diffusion/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tst/regression/test_suites/diffusion/diffusion.py b/tst/regression/test_suites/diffusion/diffusion.py
new file mode 100644
index 00000000..47a0a8b3
--- /dev/null
+++ b/tst/regression/test_suites/diffusion/diffusion.py
@@ -0,0 +1,217 @@
+# ========================================================================================
+# AthenaPK - a performance portable block structured AMR MHD code
+# Copyright (c) 2023-2024, Athena Parthenon Collaboration. All rights reserved.
+# Licensed under the 3-clause BSD License, see LICENSE file for details
+# ========================================================================================
+
+# Modules
+import math
+import numpy as np
+import matplotlib
+
+matplotlib.use("agg")
+import matplotlib.pylab as plt
+import sys
+import os
+import itertools
+import utils.test_case
+from scipy.optimize import curve_fit
+
+# To prevent littering up imported folders with .pyc files or __pycache_ folder
+sys.dont_write_bytecode = True
+
+diff_cfgs = ["visc", "ohm"]
+int_cfgs = ["unsplit", "rkl2"]
+res_cfgs = [256, 512, 1024]
+tlim = 2.0
+diff_coeff = 0.25
+
+all_cfgs = list(itertools.product(diff_cfgs, res_cfgs, int_cfgs))
+
+
+def get_outname(all_cfg):
+    diff, res, int_cfg = all_cfg
+    return f"{diff}_{res}_{int_cfg}"
+
+
+class TestCase(utils.test_case.TestCaseAbs):
+    def Prepare(self, parameters, step):
+        assert parameters.num_ranks <= 4, "Use <= 4 ranks for diffusion test."
+
+        diff, res, int_cfg = all_cfgs[step - 1]
+
+        outname = get_outname(all_cfgs[step - 1])
+
+        if diff == "visc":
+            fluid_ = "euler"
+            iprob_ = 30
+            viscosity_ = "isotropic"
+            resistivity_ = "none"
+        elif diff == "ohm":
+            fluid_ = "glmmhd"
+            iprob_ = 40
+            viscosity_ = "none"
+            resistivity_ = "ohmic"
+
+        parameters.driver_cmd_line_args = [
+            "parthenon/mesh/nx1=%d" % res,
+            "parthenon/meshblock/nx1=64",
+            "parthenon/mesh/x1min=-6.0",
+            "parthenon/mesh/x1max=6.0",
+            "parthenon/mesh/ix1_bc=outflow",
+            "parthenon/mesh/ox1_bc=outflow",
+            "parthenon/mesh/nx2=1",
+            "parthenon/meshblock/nx2=1",
+            "parthenon/mesh/x2min=-1.0",
+            "parthenon/mesh/x2max=1.0",
+            "parthenon/mesh/nx3=1",
+            "parthenon/meshblock/nx3=1",
+            f"parthenon/output0/id={outname}",
+            f"parthenon/time/tlim={tlim}",
+            # Work around for RKL2 integrator (that, by default, does not limit the
+            # timestep, which in newer versions of Parthenon results in triggering
+            # a fail-safe given the default init value of numeric_limits max.
+            "parthenon/time/dt_ceil=%f" % tlim,
+            f"hydro/fluid={fluid_}",
+            "hydro/gamma=1.4",
+            "hydro/cfl=0.8",
+            "hydro/integrator=rk2",
+            f"problem/diffusion/iprob={iprob_}",
+            f"problem/diffusion/Bx=0.0",
+            f"problem/diffusion/By=0.0",
+            "diffusion/conduction=none",
+            f"diffusion/viscosity={viscosity_}",
+            f"diffusion/resistivity={resistivity_}",
+            # we can set both as their activity is controlled separately
+            f"diffusion/mom_diff_coeff_code={diff_coeff}",
+            f"diffusion/ohm_diff_coeff_code={diff_coeff}",
+            f"diffusion/integrator={int_cfg}",
+            f"diffusion/rkl2_max_dt_ratio=200",
+        ]
+
+        return parameters
+
+    def Analyse(self, parameters):
+        sys.path.insert(
+            1,
+            parameters.parthenon_path
+            + "/scripts/python/packages/parthenon_tools/parthenon_tools",
+        )
+
+        try:
+            import phdf
+        except ModuleNotFoundError:
+            print("Couldn't find module to read Parthenon hdf5 files.")
+            return False
+
+        tests_passed = True
+
+        def get_ref(x):
+            return (
+                1e-6
+                / np.sqrt(4.0 * np.pi * diff_coeff * (0.5 + tlim))
+                * np.exp(-(x**2.0) / (4.0 * diff_coeff * (0.5 + tlim)))
+            )
+
+        num_diff = len(diff_cfgs)
+        for idx_diff in range(num_diff):
+            num_rows = len(res_cfgs)
+            num_cols = len(int_cfgs)
+            fig, p = plt.subplots(num_rows + 1, 2, sharey="row", sharex="row")
+
+            l1_err = np.zeros((len(int_cfgs), len(res_cfgs)))
+            for step in range(len(all_cfgs)):
+                diff, res, int_cfg = all_cfgs[step]
+                # only plot results for this diffusion process
+                if idx_diff != diff_cfgs.index(diff):
+                    continue
+                outname = get_outname(all_cfgs[step])
+                data_filename = (
+                    f"{parameters.output_path}/parthenon.{outname}.final.phdf"
+                )
+                data_file = phdf.phdf(data_filename)
+                # Flatten=true (default) is currently (Sep 24) broken so we manually flatten
+                components = data_file.GetComponents(
+                    data_file.Info["ComponentNames"], flatten=False
+                )
+                zz, yy, xx = data_file.GetVolumeLocations()
+                mask = yy == yy[0]
+                if diff == "visc":
+                    var_name = "prim_velocity_2"
+                elif diff == "ohm":
+                    var_name = "prim_magnetic_field_2"
+                else:
+                    print("Unknon diffusion type to process test results!")
+                    return False
+
+                v2 = components[var_name].ravel()[mask]
+                x = xx[mask]
+                row = res_cfgs.index(res)
+                col = int_cfgs.index(int_cfg)
+
+                v2_ref = get_ref(x)
+                l1 = np.average(np.abs(v2 - v2_ref))
+                l1_err[
+                    int_cfgs.index(int_cfg),
+                    res_cfgs.index(res),
+                ] = l1
+                p[row, col].plot(x, v2, label=f"N={res} L$_1$={l1:.2g}")
+
+            # Plot convergence
+            for j, int_cfg in enumerate(int_cfgs):
+                p[0, j].set_title(f"Integrator: {int_cfg}")
+
+                p[-1, j].plot(
+                    res_cfgs,
+                    l1_err[j, :],
+                    label=f"data",
+                )
+
+                # Simple convergence estimator
+                conv_model = lambda log_n, log_a, conv_rate: conv_rate * log_n + log_a
+                popt, pconv = curve_fit(
+                    conv_model, np.log(res_cfgs), np.log(l1_err[j, :])
+                )
+                conv_a, conv_measured = popt
+                # Note that the RKL2 convergence on the plots is currently significantly better
+                # than expected (<-3) though the L1 errors themself are larger than the unsplit
+                # integrator (as expected).
+                # For a more reasonable test (which would take longer), reduce the RKL2 ratio to,
+                # say, 200 and extend the resolution grid to 1024 (as the first data point at N=128
+                # is comparatively worse than at N>128).
+                if conv_measured > -1.95:
+                    print(
+                        f"!!!\nConvergence for test with {int_cfg} integrator "
+                        f"is worse ({conv_measured}) than expected (-1.95).\n!!!"
+                    )
+                    tests_passed = False
+                p[-1, j].plot(
+                    res_cfgs,
+                    np.exp(conv_a) * res_cfgs**conv_measured,
+                    ":",
+                    lw=0.75,
+                    label=f"Measured conv: {conv_measured:.2f}",
+                )
+
+            p[-1, 0].set_xscale("log")
+            p[-1, 0].set_yscale("log")
+            p[-1, 0].legend(fontsize=6)
+            p[-1, 1].legend(fontsize=6)
+
+            # Plot reference lines
+            x = np.linspace(-6, 6, 400)
+            for i in range(num_rows):
+                for j in range(num_cols):
+                    y = get_ref(x)
+                    p[i, j].plot(x, y, "-", lw=0.5, color="black", alpha=0.8)
+                    p[i, j].grid()
+                    p[i, j].legend(fontsize=6)
+
+            fig.tight_layout()
+            fig.savefig(
+                os.path.join(parameters.output_path, f"{diff_cfgs[idx_diff]}.png"),
+                bbox_inches="tight",
+                dpi=300,
+            )
+
+        return tests_passed
diff --git a/tst/regression/test_suites/diffusion_linwave3d/__init__.py b/tst/regression/test_suites/diffusion_linwave3d/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tst/regression/test_suites/diffusion_linwave3d/diffusion_linwave3d.py b/tst/regression/test_suites/diffusion_linwave3d/diffusion_linwave3d.py
new file mode 100644
index 00000000..d684a55c
--- /dev/null
+++ b/tst/regression/test_suites/diffusion_linwave3d/diffusion_linwave3d.py
@@ -0,0 +1,183 @@
+# ========================================================================================
+# AthenaPK - a performance portable block structured AMR MHD code
+# Copyright (c) 2023-2024, Athena Parthenon Collaboration. All rights reserved.
+# Licensed under the 3-clause BSD License, see LICENSE file for details
+# ========================================================================================
+
+# Modules
+import math
+import numpy as np
+import matplotlib
+
+matplotlib.use("agg")
+import matplotlib.pylab as plt
+import sys
+import os
+import utils.test_case
+from numpy.polynomial import Polynomial
+
+""" To prevent littering up imported folders with .pyc files or __pycache_ folder"""
+sys.dont_write_bytecode = True
+
+# if this is updated make sure to update the assert statements for the number of MPI ranks, too
+lin_res = [16, 32]  # resolution for linear convergence
+
+# Upper bound on relative L1 error for each above nx1:
+error_rel_tols = [0.22, 0.05]
+# lower bound on convergence rate at final (Nx1=64) asymptotic convergence regime
+rate_tols = [2.0]  # convergence rate > 3.0 for this particular resolution, sovler
+
+method = "explicit"
+
+_nu = 0.01
+_kappa = _nu * 2.0
+_eta = _kappa
+_c_s = 0.5  # slow mode wave speed of AthenaPK linear wave configuration
+
+
+class TestCase(utils.test_case.TestCaseAbs):
+    def Prepare(self, parameters, step):
+        res = lin_res[(step - 1)]
+        # make sure we can evenly distribute the MeshBlock sizes
+        err_msg = "Num ranks must be multiples of 2 for test."
+        assert parameters.num_ranks == 1 or parameters.num_ranks % 2 == 0, err_msg
+        # ensure a minimum block size of 4
+        assert 2 * res / parameters.num_ranks >= 8, "Use <= 8 ranks for test."
+
+        mb_nx1 = (2 * res) // parameters.num_ranks
+        # ensure that nx1 is <= 128 when using scratch (V100 limit on test system)
+        while mb_nx1 > 128:
+            mb_nx1 //= 2
+
+        parameters.driver_cmd_line_args = [
+            f"parthenon/mesh/nx1={2 * res}",
+            f"parthenon/meshblock/nx1={mb_nx1}",
+            f"parthenon/mesh/nx2={res}",
+            f"parthenon/meshblock/nx2={res}",
+            f"parthenon/mesh/nx3={res}",
+            f"parthenon/meshblock/nx3={res}",
+            "parthenon/mesh/nghost=2",
+            "parthenon/time/integrator=vl2",
+            "parthenon/time/tlim=3.0",
+            "hydro/reconstruction=plm",
+            "hydro/fluid=glmmhd",
+            "hydro/riemann=hlld",
+            # enable history dump to track decay of v2 component
+            "parthenon/output2/file_type=hst",
+            "parthenon/output2/dt=0.03",
+            "problem/linear_wave/dump_max_v2=true",
+            f"parthenon/job/problem_id={res}",  # hack to rename parthenon.hst to res.hst
+            # setup linear wave (L slow mode)
+            "job/problem_id=linear_wave_mhd",
+            "problem/linear_wave/amp=1e-4",
+            "problem/linear_wave/wave_flag=2",
+            "problem/linear_wave/compute_error=false",  # done here below, not in the pgen
+            # setup diffusive processes
+            "diffusion/integrator=unsplit",
+            "diffusion/conduction=isotropic",
+            "diffusion/conduction_coeff=fixed",
+            f"diffusion/thermal_diff_coeff_code={_kappa}",
+            "diffusion/viscosity=isotropic",
+            "diffusion/viscosity_coeff=fixed",
+            f"diffusion/mom_diff_coeff_code={_nu}",
+            "diffusion/resistivity=ohmic",
+            "diffusion/resistivity_coeff=fixed",
+            f"diffusion/ohm_diff_coeff_code={_eta}",
+        ]
+
+        return parameters
+
+    def Analyse(self, parameters):
+        analyze_status = True
+
+        # Following test evaluation is adapted from the one in Athena++.
+        # This also includes the limits/tolerances set above, which are identical to Athena++.
+
+        # Lambda=1 for Athena++'s linear wave setups in 1D, 2D, and 3D:
+        L = 1.0
+        ksqr = (2.0 * np.pi / L) ** 2
+        # Equation 3.13 from Ryu, et al. (modified to add thermal conduction term)
+        # fast mode decay rate = (19\nu/4 + 3\eta + 3(\gamma-1)^2*kappa/gamma/4)*(2/15)*k^2
+        # Equation 3.14 from Ryu, et al. (modified to add thermal conduction term)
+        # slow mode decay rate = (4\nu + 3\eta/4 + 3(\gamma-1)^2*kappa/gamma)*(2/15)*k^2
+        slow_mode_rate = (
+            (4.0 * _nu + 3.0 * _eta / 4.0 + _kappa * 4.0 / 5.0) * (2.0 / 15.0) * ksqr
+        )
+
+        # Equation 3.16
+        re_num = (4.0 * np.pi**2 * _c_s) / (L * slow_mode_rate)
+        analyze_status = True
+        errors_abs = []
+
+        for nx, err_tol in zip(lin_res, error_rel_tols):
+            print(
+                "[Decaying 3D Linear Wave]: "
+                "Mesh size {} x {} x {}".format(2 * nx, nx, nx)
+            )
+            filename = os.path.join(parameters.output_path, f"{nx}.out2.hst")
+            hst_data = np.genfromtxt(filename, names=True, skip_header=1)
+
+            tt = hst_data["1time"]
+            max_vy = hst_data["13MaxAbsV2"]
+            # estimate the decay rate from simulation, using weighted least-squares (WLS)
+            yy = np.log(np.abs(max_vy))
+            plt.plot(tt, yy)
+            plt.show()
+            p, [resid, rank, sv, rcond] = Polynomial.fit(
+                tt, yy, 1, w=np.sqrt(max_vy), full=True
+            )
+            resid_normal = np.sum((yy - p(tt)) ** 2)
+            r2 = 1 - resid_normal / (yy.size * yy.var())
+            pnormal = p.convert(domain=(-1, 1))
+            fit_rate = -pnormal.coef[-1]
+
+            error_abs = np.fabs(slow_mode_rate - fit_rate)
+            errors_abs += [error_abs]
+            error_rel = np.fabs(slow_mode_rate / fit_rate - 1.0)
+            err_rel_tol_percent = err_tol * 100.0
+
+            print(
+                "[Decaying 3D Linear Wave {}]: Reynolds number of slow mode: {}".format(
+                    method, re_num
+                )
+            )
+            print(
+                "[Decaying 3D Linear Wave {}]: R-squared of WLS regression = {}".format(
+                    method, r2
+                )
+            )
+            print(
+                "[Decaying 3D Linear Wave {}]: Analytic decay rate = {}".format(
+                    method, slow_mode_rate
+                )
+            )
+            print(
+                "[Decaying 3D Linear Wave {}]: Measured decay rate = {}".format(
+                    method, fit_rate
+                )
+            )
+            print(
+                "[Decaying 3D Linear Wave {}]: Decay rate absolute error = {}".format(
+                    method, error_abs
+                )
+            )
+            print(
+                "[Decaying 3D Linear Wave {}]: Decay rate relative error = {}".format(
+                    method, error_rel
+                )
+            )
+
+            if error_rel > err_tol:
+                print(
+                    "WARN [Decaying 3D Linear Wave {}]: decay rate disagrees"
+                    " with prediction by >{}%".format(method, err_rel_tol_percent)
+                )
+                analyze_status = False
+            else:
+                print(
+                    "[Decaying 3D Linear Wave {}]: decay rate is within "
+                    "{}% of analytic value".format(method, err_rel_tol_percent)
+                )
+                print("")
+
+        return analyze_status