Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add comprehensive checks for problem cells #631

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
7 changes: 7 additions & 0 deletions docs/documentation/case.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,18 @@ Definition of the parameters is described in the following subsections.
| ---: | :----: | :--- |
| `run_time_info` | Logical | Output run-time information |
| `rdma_mpi` | Logical | (GPUs) Enable RDMA for MPI communication. |
| `comp_debug` | Logical | Comprehensive variable checking |

- `run_time_info` generates a text file that includes run-time information including the CFL number(s) at each time-step.
- `rdma_mpi` optimizes data transfers between GPUs using Remote Direct Memory Access (RDMA).
The underlying MPI implementation and communication infrastructure must support this
feature, detecting GPU pointers and performing RDMA accordingly.
- `comp_debug` enables comprehensive error checking.
At each Runge-Kutta sub-step, all conservative variables are checked for NaNs.
The volume fractions are checked to ensure they are in the range [0, 1].
Negative densities are also checked for.
If any of these checks find problems, the file `comp_debug.txt` will be written to the case directory with information
about what problems were found and the simulation state will be saved for visualization.

### 2. Computational Domain

Expand Down
1 change: 1 addition & 0 deletions examples/3D_performance_test/comp_debug.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-1
2 changes: 2 additions & 0 deletions src/post_process/m_global_parameters.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ module m_global_parameters
logical :: mixture_err !< Mixture error limiter
logical :: alt_soundspeed !< Alternate sound speed
logical :: hypoelasticity !< Turn hypoelasticity on
logical :: comp_debug !< Turn on comprehensive debug
logical, parameter :: chemistry = .${chemistry}$. !< Chemistry modeling
!> @}

Expand Down Expand Up @@ -317,6 +318,7 @@ contains
relax = .false.
relax_model = dflt_int
hypoelasticity = .false.
comp_debug = .false.

bc_x%beg = dflt_int; bc_x%end = dflt_int
bc_y%beg = dflt_int; bc_y%end = dflt_int
Expand Down
2 changes: 1 addition & 1 deletion src/post_process/m_mpi_proxy.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ contains
& 'heat_ratio_wrt', 'pi_inf_wrt', 'pres_inf_wrt', 'cons_vars_wrt', &
& 'prim_vars_wrt', 'c_wrt', 'qm_wrt','schlieren_wrt', 'bubbles', 'qbmm', &
& 'polytropic', 'polydisperse', 'file_per_process', 'relax', 'cf_wrt', &
& 'adv_n', 'ib', 'cfl_adap_dt', 'cfl_const_dt', 'cfl_dt' ]
& 'adv_n', 'ib', 'cfl_adap_dt', 'cfl_const_dt', 'cfl_dt', 'comp_debug' ]
call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
#:endfor

Expand Down
52 changes: 51 additions & 1 deletion src/post_process/m_start_up.f90
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ subroutine s_read_input_file
polydisperse, poly_sigma, file_per_process, relax, &
relax_model, cf_wrt, sigma, adv_n, ib, &
cfl_adap_dt, cfl_const_dt, t_save, t_stop, n_start, &
cfl_target
cfl_target, comp_debug

! Inquiring the status of the post_process.inp file
file_loc = 'post_process.inp'
Expand Down Expand Up @@ -184,6 +184,56 @@ subroutine s_perform_time_step(t_step)

end subroutine s_perform_time_step

subroutine s_perform_comprehensive_debug(varname, pres, c, H)

real(kind(0d0)) :: pres
real(kind(0d0)) :: c
real(kind(0d0)) :: H
integer :: t_fail, ios
logical :: exists
character(LEN=name_len) :: varname !<
character(LEN=name_len) :: file_name = 'comp_debug.txt'
character(LEN=path_len + name_len) :: file_path

! Opening the run-time information file
file_path = trim(case_dir)//'/'//trim(file_name)

inquire (file=file_path, exist=exists)
if (exists) then
open (12, file=file_path)
! Read the file line by line
do
read (12, '(I9)', iostat=ios) t_fail
if (ios /= 0) exit ! Exit loop on error or end-of-file
end do
if (t_fail == -1) return
else
return
end if

if (proc_rank == 0) then
print("(A, I6)"), " Post Processing suspicious time-step: ", t_fail
end if

! Populating the grid and conservative variables
call s_read_data_files(t_fail)
! Populating the buffer regions of the grid variables
if (buff_size > 0) then
call s_populate_grid_variables_buffer_regions()
end if

! Populating the buffer regions of the conservative variables
if (buff_size > 0) then
call s_populate_conservative_variables_buffer_regions()
end if

! Converting the conservative variables to the primitive ones
call s_convert_conservative_to_primitive_variables(q_cons_vf, q_prim_vf)

call s_save_data(t_fail, varname, pres, c, H)

end subroutine s_perform_comprehensive_debug

subroutine s_save_data(t_step, varname, pres, c, H)

integer, intent(inout) :: t_step
Expand Down
2 changes: 2 additions & 0 deletions src/post_process/p_main.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ program p_main

call s_initialize_modules()

if (comp_debug) call s_perform_comprehensive_debug(varname, pres, c, H)

if (cfl_dt) then
t_step = n_start
n_save = int(t_stop/t_save) + 1
Expand Down
4 changes: 4 additions & 0 deletions src/simulation/m_global_parameters.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ module m_global_parameters
logical :: hypoelasticity !< hypoelasticity modeling
logical, parameter :: chemistry = .${chemistry}$. !< Chemistry modeling
logical :: cu_tensor
logical :: comp_debug !< Variable checking at every RK step

logical :: bodyForces
logical :: bf_x, bf_y, bf_z !< body force toggle in three directions
Expand Down Expand Up @@ -637,6 +638,9 @@ contains
! Cuda aware MPI
cu_tensor = .false.

! Comprehensive debugging
comp_debug = .false.

bodyForces = .false.
bf_x = .false.; bf_y = .false.; bf_z = .false.
!< amplitude, frequency, and phase shift sinusoid in each direction
Expand Down
2 changes: 1 addition & 1 deletion src/simulation/m_mpi_proxy.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ contains
& 'polydisperse', 'qbmm', 'acoustic_source', 'probe_wrt', 'integral_wrt', &
& 'prim_vars_wrt', 'weno_avg', 'file_per_process', 'relax', &
& 'adv_n', 'adap_dt', 'ib', 'bodyForces', 'bf_x', 'bf_y', 'bf_z', &
& 'cfl_adap_dt', 'cfl_const_dt', 'cfl_dt' ]
& 'cfl_adap_dt', 'cfl_const_dt', 'cfl_dt', 'comp_debug' ]
call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
#:endfor

Expand Down
85 changes: 84 additions & 1 deletion src/simulation/m_sim_helpers.f90
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@ module m_sim_helpers
use m_global_parameters

use m_variables_conversion

use m_mpi_proxy
! ==========================================================================

implicit none

private; public :: s_compute_enthalpy, &
s_compute_stability_from_dt, &
s_compute_dt_from_cfl
s_compute_dt_from_cfl, &
s_check_cells

contains

Expand Down Expand Up @@ -245,4 +248,84 @@ subroutine s_compute_dt_from_cfl(vel, c, max_dt, rho, Re_l, j, k, l)

end subroutine s_compute_dt_from_cfl

subroutine s_check_cells(q_cons_Vf, t_step, stage, errors)

type(scalar_field), dimension(sys_size) :: q_cons_vf
integer, intent(in) :: t_step, stage
integer :: j, k, l, i
integer errors
logical :: exists

character(LEN=name_len) :: file_name = 'comp_debug.txt'
character(LEN=path_len + name_len) :: file_path
character(100) :: str_format

! Opening the run-time information file
file_path = trim(case_dir)//'/'//trim(file_name)

str_format = "(I9, A, I3, A, I4, I4, I4, A, I2, A, I5, A, I5, I5, I5)"

open (12, FILE=trim(file_path), &
STATUS='replace')

errors = 0

! Check all variables for NaNs
do i = 1, sys_size
!$acc update host(q_cons_vf(i)%sf)
do l = 0, p
do k = 0, n
do j = 0, m
if (ieee_is_nan(q_cons_vf(i)%sf(j, k, l))) then
write (12, str_format) t_step, " NaN(s) in conservative variables after RK stage ", &
stage, " at (j,k,l) ", j, k, l, " equation", i, " proc", proc_rank, &
" (m, n, p)", m, n, p
errors = errors + 1
end if
end do
end do
end do
end do

! Check for invalid volume fractions
do i = advxb, advxe
do l = 0, p
do k = 0, n
do j = 0, m
if (q_cons_vf(i)%sf(j, k, l) < 0d0) then
write (12, str_format) t_step, " Volume fraction < 0 after RK stage ", &
stage, " at (j,k,l) ", j, k, l, " equation", i, " proc", proc_rank, &
" (m, n, p)", m, n, p
errors = errors + 1
elseif (q_cons_vf(i)%sf(j, k, l) > 1d0 + verysmall) then
write (12, str_format) t_step, " Volume fraction > 1 after RK stage ", &
stage, " at (j,k,l) ", j, k, l, " equation", i, " proc", proc_rank, &
" (m, n, p)", m, n, p
errors = errors + 1
end if
end do
end do
end do
end do

! Check for invalid densities
do i = contxb, contxe
do l = 0, p
do k = 0, n
do j = 0, m
if (q_cons_vf(advxb + i - 1)%sf(j, k, l) < 0d0 .and. q_cons_vf(i)%sf(j, k, l) < 0d0 .or. &
q_cons_vf(advxb + i - 1)%sf(j, k, l) > 0d0 .and. q_cons_Vf(i)%sf(j, k, l) < 0d0) then
print *, q_cons_vf(advxb + i - 1)%sf(j, k, l), q_cons_vf(i)%sf(j, k, l)
write (12, str_format) t_step, " Density is negative after RK stage ", &
stage, " at (j,k,l) ", j, k, l, " equation", i, " proc", proc_rank, &
" (m, n, p)", m, n, p
errors = errors + 1
end if
end do
end do
end do
end do

end subroutine

end module m_sim_helpers
3 changes: 2 additions & 1 deletion src/simulation/m_start_up.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,8 @@ contains
pi_fac, adv_n, adap_dt, bf_x, bf_y, bf_z, &
k_x, k_y, k_z, w_x, w_y, w_z, p_x, p_y, p_z, &
g_x, g_y, g_z, n_start, t_save, t_stop, &
cfl_adap_dt, cfl_const_dt, cfl_target
cfl_adap_dt, cfl_const_dt, cfl_target, &
comp_debug

! Checking that an input file has been provided by the user. If it
! has, then the input file is read in, otherwise, simulation exits.
Expand Down
37 changes: 37 additions & 0 deletions src/simulation/m_time_steppers.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ module m_time_steppers
use m_thermochem

use m_body_forces

use ieee_arithmetic

use m_sim_helpers
! ==========================================================================

implicit none
Expand Down Expand Up @@ -411,6 +415,8 @@ contains

if (model_eqns == 3) call s_pressure_relaxation_procedure(q_cons_ts(1)%vf)

if (comp_debug) call s_comprehensive_debug(q_cons_ts(1)%vf, t_step, 1)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you're going to add this call after every time step, why not put it outside the specific time stepper loop and instead in p_main? (or whatever calls the specific time stepper)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This called after each stage of a time-step, so it's called three times for one RK3 timestep

Copy link
Member

@sbryngelson sbryngelson Oct 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

then shouldn't it be in s_compute_rhs? (or, shouldn't there be a compute_timestep_stage subroutine that does compute_rhs + s_pressure_relaxation + whatever_else)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Refactoring the time stepping routines completely would seem to violate "This PR comprises a set of related changes with a common goal" in my opinion.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps. Then we can do that one first.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not convinced this change is possible since the update step for each substep is different.
Step 1

q_cons_ts(2)%vf(i)%sf(j, k, l) = &
                            q_cons_ts(1)%vf(i)%sf(j, k, l) &
                            + dt*rhs_vf(i)%sf(j, k, l)

Step 2

q_cons_ts(2)%vf(i)%sf(j, k, l) = &
                            (3d0*q_cons_ts(1)%vf(i)%sf(j, k, l) &
                             + q_cons_ts(2)%vf(i)%sf(j, k, l) &
                             + dt*rhs_vf(i)%sf(j, k, l))/4d0

Step 3

q_cons_ts(1)%vf(i)%sf(j, k, l) = &
                            (q_cons_ts(1)%vf(i)%sf(j, k, l) &
                             + 2d0*q_cons_ts(2)%vf(i)%sf(j, k, l) &
                             + 2d0*dt*rhs_vf(i)%sf(j, k, l))/3d0

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. Well, at the moment, each time stepper stage is followed by 80 lines of (at least) copy/pasting. The beginning and end of each time step (not stage) has 10 or so lines of copy/pasting.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

        call s_compute_rhs(q_cons_ts(1)%vf, q_prim_vf, rhs_vf, pb_ts(1)%sf, rhs_pb, mv_ts(1)%sf, rhs_mv, t_step, time_avg)

        if (run_time_info) then
            call s_write_run_time_information(q_prim_vf, t_step)
        end if

        if (probe_wrt) then
            call s_time_step_cycling(t_step)
        end if

        if (cfl_dt) then
            if (mytime >= t_stop) return
        else
            if (t_step == t_step_stop) return
        end if

        !$acc parallel loop collapse(4) gang vector default(present)
        do i = 1, sys_size
            do l = 0, p
                do k = 0, n
                    do j = 0, m
                        q_cons_ts(2)%vf(i)%sf(j, k, l) = &
                            q_cons_ts(1)%vf(i)%sf(j, k, l) &
                            + dt*rhs_vf(i)%sf(j, k, l)
                    end do
                end do
            end do
        end do

        !Evolve pb and mv for non-polytropic qbmm
        if (qbmm .and. (.not. polytropic)) then
            !$acc parallel loop collapse(5) gang vector default(present)
            do i = 1, nb
                do l = 0, p
                    do k = 0, n
                        do j = 0, m
                            do q = 1, nnode
                                pb_ts(2)%sf(j, k, l, q, i) = &
                                    pb_ts(1)%sf(j, k, l, q, i) &
                                    + dt*rhs_pb(j, k, l, q, i)
                            end do
                        end do
                    end do
                end do
            end do
        end if

        if (qbmm .and. (.not. polytropic)) then
            !$acc parallel loop collapse(5) gang vector default(present)
            do i = 1, nb
                do l = 0, p
                    do k = 0, n
                        do j = 0, m
                            do q = 1, nnode
                                mv_ts(2)%sf(j, k, l, q, i) = &
                                    mv_ts(1)%sf(j, k, l, q, i) &
                                    + dt*rhs_mv(j, k, l, q, i)
                            end do
                        end do
                    end do
                end do
            end do
        end if

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll condense some of this then


if (adv_n) call s_comp_alpha_from_n(q_cons_ts(1)%vf)

if (ib) then
Expand Down Expand Up @@ -518,6 +524,8 @@ contains
call s_pressure_relaxation_procedure(q_cons_ts(2)%vf)
end if

if (comp_debug) call s_comprehensive_debug(q_cons_ts(1)%vf, t_step, 1)

if (adv_n) call s_comp_alpha_from_n(q_cons_ts(2)%vf)

if (ib) then
Expand Down Expand Up @@ -593,6 +601,8 @@ contains
call s_pressure_relaxation_procedure(q_cons_ts(1)%vf)
end if

if (comp_debug) call s_comprehensive_debug(q_cons_ts(1)%vf, t_step, 2)

if (adv_n) call s_comp_alpha_from_n(q_cons_ts(1)%vf)

if (ib) then
Expand Down Expand Up @@ -703,6 +713,8 @@ contains
call s_pressure_relaxation_procedure(q_cons_ts(2)%vf)
end if

if (comp_debug) call s_comprehensive_debug(q_cons_ts(1)%vf, t_step, 1)

if (adv_n) call s_comp_alpha_from_n(q_cons_ts(2)%vf)

if (ib) then
Expand Down Expand Up @@ -778,6 +790,8 @@ contains
call s_pressure_relaxation_procedure(q_cons_ts(2)%vf)
end if

if (comp_debug) call s_comprehensive_debug(q_cons_ts(1)%vf, t_step, 2)

if (adv_n) call s_comp_alpha_from_n(q_cons_ts(2)%vf)

if (ib) then
Expand Down Expand Up @@ -852,6 +866,8 @@ contains
call s_pressure_relaxation_procedure(q_cons_ts(1)%vf)
end if

if (comp_debug) call s_comprehensive_debug(q_cons_ts(1)%vf, t_step, 3)

if (adv_n) call s_comp_alpha_from_n(q_cons_ts(1)%vf)

if (ib) then
Expand Down Expand Up @@ -1014,6 +1030,26 @@ contains

end subroutine s_apply_bodyforces

subroutine s_comprehensive_debug(q_cons_vf, t_step, stage)

type(scalar_field), dimension(sys_size) :: q_cons_vf
integer, intent(in) :: t_step, stage

integer :: errors

call s_check_cells(q_cons_vf, t_step, stage, errors)

if (errors /= 0) then
close (12)
call s_write_data_files(q_cons_vf, q_prim_vf, t_step)
call s_mpi_abort("Errors found in conservative variables")
end if

write (12, "(I3)") - 1
close (12)

end subroutine s_comprehensive_debug

!> This subroutine saves the temporary q_prim_vf vector
!! into the q_prim_ts vector that is then used in p_main
!! @param t_step current time-step
Expand Down Expand Up @@ -1053,6 +1089,7 @@ contains
end if

end subroutine s_time_step_cycling

!> Module deallocation and/or disassociation procedures
subroutine s_finalize_time_steppers_module

Expand Down
2 changes: 2 additions & 0 deletions toolchain/mfc/run/case_dicts.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ def analytic(self):
't_save': ParamType.REAL,
'cfl_target': ParamType.REAL,
'low_Mach': ParamType.INT,
'comp_debug': ParamType.LOG,
})

for var in [ 'advection', 'diffusion', 'reactions' ]:
Expand Down Expand Up @@ -328,6 +329,7 @@ def analytic(self):
't_save': ParamType.REAL,
't_stop': ParamType.REAL,
'n_start': ParamType.INT,
'comp_debug': ParamType.LOG,
})

for cmp_id in range(1,3+1):
Expand Down
Loading