From f875a9971dcc9dcb8489e64209274814f97cf2c1 Mon Sep 17 00:00:00 2001 From: Katelyn FitzGerald <7872563+kafitzgerald@users.noreply.github.com> Date: Fri, 11 Oct 2019 16:12:31 -0600 Subject: [PATCH] Fix to MPI for coupled WRF + WRF-Hydro (#384) * Remove MPI dup from if statement * Add additional checks for valid MPI_COMM * Check if HYDRO_COMM_WORLD is not MPI_COMM_NULL, and if so, skip re-duplicating it * Added error logging * Per the above, moved the error logging into the CPL_LAND module, and updated NoahMP driver to find it in the right place --- .../IO_code/module_NoahMP_hrldas_driver.F | 4 +-- trunk/NDHMS/MPP/CPL_WRF.F | 26 ++++++++++++++----- trunk/NDHMS/MPP/mpp_land.F | 21 +++++---------- 3 files changed, 29 insertions(+), 22 deletions(-) diff --git a/trunk/NDHMS/Land_models/NoahMP/IO_code/module_NoahMP_hrldas_driver.F b/trunk/NDHMS/Land_models/NoahMP/IO_code/module_NoahMP_hrldas_driver.F index e84410235..8b0632739 100644 --- a/trunk/NDHMS/Land_models/NoahMP/IO_code/module_NoahMP_hrldas_driver.F +++ b/trunk/NDHMS/Land_models/NoahMP/IO_code/module_NoahMP_hrldas_driver.F @@ -26,8 +26,8 @@ module module_NoahMP_hrldas_driver USE module_date_utilities #ifdef MPP_LAND use module_mpp_land, only: MPP_LAND_PAR_INI, mpp_land_init, getLocalXY, mpp_land_bcast_char, mpp_land_sync - use module_mpp_land, only: check_land, node_info, fatal_error_stop, numprocs - use module_cpl_land, only: cpl_land_init + use module_mpp_land, only: check_land, node_info, numprocs + use module_cpl_land, only: fatal_error_stop, cpl_land_init #endif #ifdef WRF_HYDRO use module_NWM_io, only: output_NoahMP_NWM diff --git a/trunk/NDHMS/MPP/CPL_WRF.F b/trunk/NDHMS/MPP/CPL_WRF.F index 6cfb5799d..5192c26eb 100644 --- a/trunk/NDHMS/MPP/CPL_WRF.F +++ b/trunk/NDHMS/MPP/CPL_WRF.F @@ -21,11 +21,12 @@ ! This is used as a coupler with the WRF model. MODULE MODULE_CPL_LAND - !use module_mpp_land, only: HYDRO_COMM_WORLD + use mpi + use, intrinsic :: iso_fortran_env, only: error_unit IMPLICIT NONE - integer, public :: HYDRO_COMM_WORLD = -1 + integer, public :: HYDRO_COMM_WORLD = MPI_COMM_NULL integer my_global_id integer total_pe_num @@ -56,7 +57,6 @@ MODULE MODULE_CPL_LAND subroutine CPL_LAND_INIT(istart,iend,jstart,jend) implicit none - include "mpif.h" integer ierr logical mpi_inited integer istart,iend,jstart,jend @@ -69,12 +69,17 @@ subroutine CPL_LAND_INIT(istart,iend,jstart,jend) CALL mpi_initialized( mpi_inited, ierr ) if ( .NOT. mpi_inited ) then - call mpi_init(ierr) - HYDRO_COMM_WORLD = MPI_COMM_WORLD + call mpi_init(ierr) + if (ierr /= MPI_SUCCESS) call fatal_error_stop("MPI Error: MPI_INIT failed") + endif + if (HYDRO_COMM_WORLD == MPI_COMM_NULL) then + call MPI_COMM_DUP(MPI_COMM_WORLD, HYDRO_COMM_WORLD, ierr) + if (ierr /= MPI_SUCCESS) call fatal_error_stop("MPI Error: MPI_COMM_DUP failed") endif call MPI_COMM_RANK( HYDRO_COMM_WORLD, my_global_id, ierr ) call MPI_COMM_SIZE( HYDRO_COMM_WORLD, total_pe_num, ierr ) + if (ierr /= MPI_SUCCESS) call fatal_error_stop("MPI Error: MPI_COMM_RANK and/or MPI_COMM_SIZE failed") allocate(node_info(9,total_pe_num)) @@ -126,7 +131,6 @@ END subroutine CPL_LAND_INIT subroutine send_info() implicit none - include "mpif.h" integer,allocatable,dimension(:,:) :: tmp_info integer ierr, i,size, tag integer mpp_status(MPI_STATUS_SIZE) @@ -222,4 +226,14 @@ subroutine find_down() return end subroutine find_down + ! stop the job due to the fatal error. + subroutine fatal_error_stop(msg) + character(len=*) :: msg + integer :: ierr + write(error_unit,*) "The job is stoped due to the fatal error. ", trim(msg) + call flush(error_unit) + CALL MPI_Abort(HYDRO_COMM_WORLD, 1, ierr) + call MPI_Finalize(ierr) + return + end subroutine fatal_error_stop END MODULE MODULE_CPL_LAND diff --git a/trunk/NDHMS/MPP/mpp_land.F b/trunk/NDHMS/MPP/mpp_land.F index 7e201941f..6415aec43 100644 --- a/trunk/NDHMS/MPP/mpp_land.F +++ b/trunk/NDHMS/MPP/mpp_land.F @@ -22,9 +22,9 @@ MODULE MODULE_MPP_LAND use MODULE_CPL_LAND - + use mpi + IMPLICIT NONE - include "mpif.h" !integer, public :: HYDRO_COMM_WORLD ! communicator for WRF-Hydro - moved to MODULE_CPL_LAND integer, public :: left_id,right_id,up_id,down_id,my_id integer, public :: left_right_np,up_down_np ! define total process in two dimensions. @@ -154,11 +154,16 @@ subroutine MPP_LAND_INIT() call mpi_initialized( mpi_inited, ierr ) if ( .not. mpi_inited ) then call MPI_INIT_THREAD( MPI_THREAD_FUNNELED, provided, ierr ) + if (ierr /= MPI_SUCCESS) call fatal_error_stop("MPI Error: MPI_INIT failed") + endif + if (HYDRO_COMM_WORLD == MPI_COMM_NULL) then call MPI_COMM_DUP(MPI_COMM_WORLD, HYDRO_COMM_WORLD, ierr) + if (ierr /= MPI_SUCCESS) call fatal_error_stop("MPI Error: MPI_COMM_DUP failed") endif call MPI_COMM_RANK( HYDRO_COMM_WORLD, my_id, ierr ) call MPI_COMM_SIZE( HYDRO_COMM_WORLD, numprocs, ierr ) + if (ierr /= MPI_SUCCESS) call fatal_error_stop("MPI Error: MPI_COMM_RANK and/or MPI_COMM_SIZE failed") ! create 2d logical mapping of the CPU. call log_map2d() @@ -2039,7 +2044,6 @@ end subroutine sum_real8 ! subroutine get_globalDim(ix,g_ix) ! implicit none ! integer ix,g_ix, ierr -! include "mpif.h" ! ! if ( my_id .eq. IO_id ) then ! g_ix = ix @@ -2278,17 +2282,6 @@ subroutine mpp_collect_1d_int_mem(nlinks,vinout) end subroutine mpp_collect_1d_int_mem -! stop the job due to the fatal error. - subroutine fatal_error_stop(msg) - character(len=*) :: msg - integer :: ierr - write(6,*) "The job is stoped due to the fatal error. ", trim(msg) - call flush(6) - call mpp_land_abort() - call MPI_finalize(ierr) - return - end subroutine fatal_error_stop - subroutine updateLake_seqInt(in,nsize,in0) implicit none integer :: nsize