From 494094a77b0b543b517dbfb15b1715b042944506 Mon Sep 17 00:00:00 2001 From: Paul Mullowney Date: Mon, 12 Apr 2021 10:31:46 -0600 Subject: [PATCH] Adding optimized multi GPU matrix/vector assembly. --- src/IJ_mv/HYPRE_IJMatrix.c | 84 +++ src/IJ_mv/HYPRE_IJVector.c | 84 +++ src/IJ_mv/HYPRE_IJ_mv.h | 70 ++- src/IJ_mv/IJMatrix_parcsr.c | 69 +++ src/IJ_mv/IJMatrix_parcsr_device.c | 893 +++++++++++++++++++---------- src/IJ_mv/IJVector_parcsr.c | 78 +++ src/IJ_mv/IJVector_parcsr_device.c | 342 ++++++++--- src/IJ_mv/_hypre_IJ_mv.h | 34 +- src/IJ_mv/aux_par_vector.c | 16 +- src/IJ_mv/aux_parcsr_matrix.c | 20 +- 10 files changed, 1267 insertions(+), 423 deletions(-) diff --git a/src/IJ_mv/HYPRE_IJMatrix.c b/src/IJ_mv/HYPRE_IJMatrix.c index 59f7ace089..067bced364 100644 --- a/src/IJ_mv/HYPRE_IJMatrix.c +++ b/src/IJ_mv/HYPRE_IJMatrix.c @@ -1037,6 +1037,90 @@ HYPRE_IJMatrixSetMaxOffProcElmts( HYPRE_IJMatrix matrix, * create IJMatrix on host memory *--------------------------------------------------------------------------*/ +HYPRE_Int +HYPRE_IJMatrixSetMaxOnProcElmts( HYPRE_IJMatrix matrix, + HYPRE_Int max_on_proc_elmts) +{ + hypre_IJMatrix *ijmatrix = (hypre_IJMatrix *) matrix; + + if (!ijmatrix) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + if ( hypre_IJMatrixObjectType(ijmatrix) == HYPRE_PARCSR ) + { + return( hypre_IJMatrixSetMaxOnProcElmtsParCSR(ijmatrix, + max_on_proc_elmts) ); + } + else + { + hypre_error_in_arg(1); + } + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_IJMatrixSetOffProcSendElmts( HYPRE_IJMatrix matrix, + HYPRE_Int off_proc_send_elmts) +{ + hypre_IJMatrix *ijmatrix = (hypre_IJMatrix *) matrix; + + if (!ijmatrix) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + if ( hypre_IJMatrixObjectType(ijmatrix) == HYPRE_PARCSR ) + { + return( hypre_IJMatrixSetOffProcSendElmtsParCSR(ijmatrix, + off_proc_send_elmts) ); + } + else + { + hypre_error_in_arg(1); + } + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_IJMatrixSetOffProcRecvElmts( HYPRE_IJMatrix matrix, + HYPRE_Int off_proc_recv_elmts) +{ + hypre_IJMatrix *ijmatrix = (hypre_IJMatrix *) matrix; + + if (!ijmatrix) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + if ( hypre_IJMatrixObjectType(ijmatrix) == HYPRE_PARCSR ) + { + return( hypre_IJMatrixSetOffProcRecvElmtsParCSR(ijmatrix, + off_proc_recv_elmts) ); + } + else + { + hypre_error_in_arg(1); + } + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + *--------------------------------------------------------------------------*/ + HYPRE_Int HYPRE_IJMatrixRead( const char *filename, MPI_Comm comm, diff --git a/src/IJ_mv/HYPRE_IJVector.c b/src/IJ_mv/HYPRE_IJVector.c index ad3dd0391c..7151184ca3 100644 --- a/src/IJ_mv/HYPRE_IJVector.c +++ b/src/IJ_mv/HYPRE_IJVector.c @@ -438,6 +438,90 @@ HYPRE_IJVectorSetMaxOffProcElmts( HYPRE_IJVector vector, return hypre_error_flag; } +/*-------------------------------------------------------------------------- + * HYPRE_IJVectorSetMaxOnProcElmts + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_IJVectorSetMaxOnProcElmts( HYPRE_IJVector vector, + HYPRE_Int max_on_proc_elmts ) +{ + hypre_IJVector *vec = (hypre_IJVector *) vector; + + if (!vec) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + if ( hypre_IJVectorObjectType(vec) == HYPRE_PARCSR ) + { + return( hypre_IJVectorSetMaxOnProcElmtsPar(vec, max_on_proc_elmts)); + } + else + { + hypre_error_in_arg(1); + } + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * HYPRE_IJVectorSetOffProcSendElmts + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_IJVectorSetOffProcSendElmts( HYPRE_IJVector vector, + HYPRE_Int off_proc_send_elmts ) +{ + hypre_IJVector *vec = (hypre_IJVector *) vector; + + if (!vec) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + if ( hypre_IJVectorObjectType(vec) == HYPRE_PARCSR ) + { + return( hypre_IJVectorSetOffProcSendElmtsPar(vec, off_proc_send_elmts)); + } + else + { + hypre_error_in_arg(1); + } + + return hypre_error_flag; +} + +/*-------------------------------------------------------------------------- + * HYPRE_IJVectorSetOffProcRecvElmts + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_IJVectorSetOffProcRecvElmts( HYPRE_IJVector vector, + HYPRE_Int off_proc_recv_elmts ) +{ + hypre_IJVector *vec = (hypre_IJVector *) vector; + + if (!vec) + { + hypre_error_in_arg(1); + return hypre_error_flag; + } + + if ( hypre_IJVectorObjectType(vec) == HYPRE_PARCSR ) + { + return( hypre_IJVectorSetOffProcRecvElmtsPar(vec, off_proc_recv_elmts)); + } + else + { + hypre_error_in_arg(1); + } + + return hypre_error_flag; +} + /*-------------------------------------------------------------------------- * HYPRE_IJVectorSetObjectType *--------------------------------------------------------------------------*/ diff --git a/src/IJ_mv/HYPRE_IJ_mv.h b/src/IJ_mv/HYPRE_IJ_mv.h index 19da941211..16089329a1 100644 --- a/src/IJ_mv/HYPRE_IJ_mv.h +++ b/src/IJ_mv/HYPRE_IJ_mv.h @@ -297,6 +297,40 @@ HYPRE_Int HYPRE_IJMatrixSetDiagOffdSizes(HYPRE_IJMatrix matrix, HYPRE_Int HYPRE_IJMatrixSetMaxOffProcElmts(HYPRE_IJMatrix matrix, HYPRE_Int max_off_proc_elmts); +/** + * (Optional) Sets the maximum number of elements that are expected to be set + * (or added) on this processor from this processor + * This routine can significantly improve the efficiency of matrix + * construction, and should always be utilized if possible. + * + * Not collective. + **/ +HYPRE_Int HYPRE_IJMatrixSetMaxOnProcElmts(HYPRE_IJMatrix matrix, + HYPRE_Int max_on_proc_elmts); + +/** + * (Optional) Sets the exact number of elements that are expected to be set + * added on other processors from this processor. + * This routine can significantly improve the efficiency of matrix + * construction, and should always be utilized if possible. + * + * Not collective. + **/ +HYPRE_Int HYPRE_IJMatrixSetOffProcSendElmts(HYPRE_IJMatrix matrix, + HYPRE_Int off_proc_send_elmts); + + +/** + * (Optional) Sets the exact number of elements that are expected to be set + * added on this processor from other processors + * This routine can significantly improve the efficiency of matrix + * construction, and should always be utilized if possible. + * + * Not collective. + **/ +HYPRE_Int HYPRE_IJMatrixSetOffProcRecvElmts(HYPRE_IJMatrix matrix, + HYPRE_Int off_proc_recv_elmts); + /** * (Optional) Sets the print level, if the user wants to print * error messages. The default is 0, i.e. no error messages are printed. @@ -399,7 +433,7 @@ HYPRE_Int HYPRE_IJVectorInitialize_v2( HYPRE_IJVector vector, HYPRE_MemoryLocati /** * (Optional) Sets the maximum number of elements that are expected to be set * (or added) on other processors from this processor - * This routine can significantly improve the efficiency of matrix + * This routine can significantly improve the efficiency of vector * construction, and should always be utilized if possible. * * Not collective. @@ -407,6 +441,40 @@ HYPRE_Int HYPRE_IJVectorInitialize_v2( HYPRE_IJVector vector, HYPRE_MemoryLocati HYPRE_Int HYPRE_IJVectorSetMaxOffProcElmts(HYPRE_IJVector vector, HYPRE_Int max_off_proc_elmts); +/** + * (Optional) Sets the maximum number of elements that are expected to be set + * (or added) on this processor from this processor + * This routine can significantly improve the efficiency of vector + * construction, and should always be utilized if possible. + * + * Not collective. + **/ +HYPRE_Int HYPRE_IJVectorSetMaxOnProcElmts(HYPRE_IJVector vector, + HYPRE_Int max_on_proc_elmts); + +/** + * (Optional) Sets the exact number of elements that are expected to be set + * added on other processors from this processor. + * This routine can significantly improve the efficiency of vector + * construction, and should always be utilized if possible. + * + * Not collective. + **/ +HYPRE_Int HYPRE_IJVectorSetOffProcSendElmts(HYPRE_IJVector vector, + HYPRE_Int off_proc_send_elmts); + + +/** + * (Optional) Sets the exact number of elements that are expected to be set + * added on this processor from other processors + * This routine can significantly improve the efficiency of vector + * construction, and should always be utilized if possible. + * + * Not collective. + **/ +HYPRE_Int HYPRE_IJVectorSetOffProcRecvElmts(HYPRE_IJVector vector, + HYPRE_Int off_proc_recv_elmts); + /** * Sets values in vector. The arrays \e values and \e indices * are of dimension \e nvalues and contain the vector values to be diff --git a/src/IJ_mv/IJMatrix_parcsr.c b/src/IJ_mv/IJMatrix_parcsr.c index f24d660f96..d393515bcf 100644 --- a/src/IJ_mv/IJMatrix_parcsr.c +++ b/src/IJ_mv/IJMatrix_parcsr.c @@ -179,6 +179,7 @@ hypre_IJMatrixSetDiagOffdSizesParCSR(hypre_IJMatrix *matrix, return hypre_error_flag; } + /****************************************************************************** * * hypre_IJMatrixSetMaxOnProcElmtsParCSR @@ -246,6 +247,74 @@ hypre_IJMatrixSetMaxOffProcElmtsParCSR(hypre_IJMatrix *matrix, return hypre_error_flag; } + +/****************************************************************************** + * + * hypre_IJMatrixSetOffProcSendElmtsParCSR + * + *****************************************************************************/ + +HYPRE_Int +hypre_IJMatrixSetOffProcSendElmtsParCSR(hypre_IJMatrix *matrix, + HYPRE_Int off_proc_send_elmts) +{ + hypre_AuxParCSRMatrix *aux_matrix; + HYPRE_Int local_num_rows, local_num_cols, my_id; + HYPRE_BigInt *row_partitioning = hypre_IJMatrixRowPartitioning(matrix); + HYPRE_BigInt *col_partitioning = hypre_IJMatrixColPartitioning(matrix); + MPI_Comm comm = hypre_IJMatrixComm(matrix); + + hypre_MPI_Comm_rank(comm,&my_id); + aux_matrix = (hypre_AuxParCSRMatrix *) hypre_IJMatrixTranslator(matrix); + if (!aux_matrix) + { + local_num_rows = (HYPRE_Int)(row_partitioning[1]-row_partitioning[0]); + local_num_cols = (HYPRE_Int)(col_partitioning[1]-col_partitioning[0]); + hypre_AuxParCSRMatrixCreate(&aux_matrix, local_num_rows, + local_num_cols, NULL); + hypre_IJMatrixTranslator(matrix) = aux_matrix; + } +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + hypre_AuxParCSRMatrixUsrOffProcSendElmts(aux_matrix) = off_proc_send_elmts; + hypre_AuxParCSRMatrixUsrElmtsFilled(aux_matrix) = 0; +#endif + return hypre_error_flag; +} + + +/****************************************************************************** + * + * hypre_IJMatrixSetOffProcRecvElmtsParCSR + * + *****************************************************************************/ + +HYPRE_Int +hypre_IJMatrixSetOffProcRecvElmtsParCSR(hypre_IJMatrix *matrix, + HYPRE_Int off_proc_recv_elmts) +{ + hypre_AuxParCSRMatrix *aux_matrix; + HYPRE_Int local_num_rows, local_num_cols, my_id; + HYPRE_BigInt *row_partitioning = hypre_IJMatrixRowPartitioning(matrix); + HYPRE_BigInt *col_partitioning = hypre_IJMatrixColPartitioning(matrix); + MPI_Comm comm = hypre_IJMatrixComm(matrix); + + hypre_MPI_Comm_rank(comm,&my_id); + aux_matrix = (hypre_AuxParCSRMatrix *) hypre_IJMatrixTranslator(matrix); + if (!aux_matrix) + { + local_num_rows = (HYPRE_Int)(row_partitioning[1]-row_partitioning[0]); + local_num_cols = (HYPRE_Int)(col_partitioning[1]-col_partitioning[0]); + hypre_AuxParCSRMatrixCreate(&aux_matrix, local_num_rows, + local_num_cols, NULL); + hypre_IJMatrixTranslator(matrix) = aux_matrix; + } +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + hypre_AuxParCSRMatrixUsrOffProcRecvElmts(aux_matrix) = off_proc_recv_elmts; + hypre_AuxParCSRMatrixUsrElmtsFilled(aux_matrix) = 0; +#endif + return hypre_error_flag; +} + /****************************************************************************** * * hypre_IJMatrixInitializeParCSR diff --git a/src/IJ_mv/IJMatrix_parcsr_device.c b/src/IJ_mv/IJMatrix_parcsr_device.c index 7253244a8c..2abbc56f04 100644 --- a/src/IJ_mv/IJMatrix_parcsr_device.c +++ b/src/IJ_mv/IJMatrix_parcsr_device.c @@ -98,86 +98,127 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, hypre_IJMatrixTranslator(matrix) = aux_matrix; } - HYPRE_Int stack_elmts_max = hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix); HYPRE_Int stack_elmts_current = hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix); - HYPRE_Int stack_elmts_required = stack_elmts_current + nelms; HYPRE_BigInt *stack_i = hypre_AuxParCSRMatrixStackI(aux_matrix); HYPRE_BigInt *stack_j = hypre_AuxParCSRMatrixStackJ(aux_matrix); HYPRE_Complex *stack_data = hypre_AuxParCSRMatrixStackData(aux_matrix); - char *stack_sora = hypre_AuxParCSRMatrixStackSorA(aux_matrix); + char *stack_sora = hypre_AuxParCSRMatrixStackSorA(aux_matrix); - if ( stack_elmts_max < stack_elmts_required ) + /* Each of these parameters must be set to enter this simplified path */ + if (hypre_AuxParCSRMatrixUsrOnProcElmts(aux_matrix)>=0 && + hypre_AuxParCSRMatrixUsrOffProcSendElmts(aux_matrix)>=0 && + hypre_AuxParCSRMatrixUsrOffProcRecvElmts(aux_matrix)>=0) { - HYPRE_Int stack_elmts_max_new = hypre_max(hypre_AuxParCSRMatrixUsrOnProcElmts (aux_matrix), 0) + - hypre_max(hypre_AuxParCSRMatrixUsrOffProcElmts(aux_matrix), 0); - if ( hypre_AuxParCSRMatrixUsrOnProcElmts (aux_matrix) < 0 || - hypre_AuxParCSRMatrixUsrOffProcElmts(aux_matrix) < 0 ) + HYPRE_Int stack_elmts_on_proc = hypre_AuxParCSRMatrixUsrOnProcElmts(aux_matrix); + HYPRE_Int stack_elmts_off_proc_send = hypre_AuxParCSRMatrixUsrOffProcSendElmts(aux_matrix); + char usr_elmts_filled = hypre_AuxParCSRMatrixUsrElmtsFilled(aux_matrix); + + /* Neither has been filled ... allocate space */ + if (!stack_i && !stack_j && !stack_data) { + hypre_AuxParCSRMatrixStackI(aux_matrix) = stack_i = const_cast(rows); + hypre_AuxParCSRMatrixStackJ(aux_matrix) = stack_j = const_cast(cols); + hypre_AuxParCSRMatrixStackData(aux_matrix) = stack_data = const_cast(values); + } + + /* grab the current version of the stack size ... used for SetAddValues called from IJAssemble */ + if (SorA==1 && usr_elmts_filled==0) { + /* set the current stack position to 0 and set the filled flag to 1 as this routine is going to fill */ + hypre_assert(stack_elmts_on_proc == nelms); + hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix) = stack_elmts_current = 0; + } + else if (SorA==0 && usr_elmts_filled==0) { - stack_elmts_max_new = hypre_max(num_local_rows * hypre_AuxParCSRMatrixInitAllocFactor(aux_matrix), stack_elmts_max_new); - stack_elmts_max_new = hypre_max(stack_elmts_max * hypre_AuxParCSRMatrixGrowFactor(aux_matrix), stack_elmts_max_new); + /* set the current stack position to stack_elmts_on_proc + stack_elmts_off_proc_recv; and set the corresponding filled flag to 1 as this routine is going to fill */ + hypre_assert(stack_elmts_off_proc_send == nelms); + hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix) = stack_elmts_current = stack_elmts_on_proc; } - stack_elmts_max_new = hypre_max(stack_elmts_required, stack_elmts_max_new); - hypre_AuxParCSRMatrixStackI(aux_matrix) = stack_i = hypre_TReAlloc_v2(stack_i, HYPRE_BigInt, stack_elmts_max, HYPRE_BigInt, stack_elmts_max_new, HYPRE_MEMORY_DEVICE); - hypre_AuxParCSRMatrixStackJ(aux_matrix) = stack_j = hypre_TReAlloc_v2(stack_j, HYPRE_BigInt, stack_elmts_max, HYPRE_BigInt, stack_elmts_max_new, HYPRE_MEMORY_DEVICE); - hypre_AuxParCSRMatrixStackData(aux_matrix) = stack_data = hypre_TReAlloc_v2(stack_data, HYPRE_Complex, stack_elmts_max, HYPRE_Complex, stack_elmts_max_new, HYPRE_MEMORY_DEVICE); - hypre_AuxParCSRMatrixStackSorA(aux_matrix) = stack_sora = hypre_TReAlloc_v2(stack_sora, char, stack_elmts_max, char, stack_elmts_max_new, HYPRE_MEMORY_DEVICE); - hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix) = stack_elmts_max_new; + if (usr_elmts_filled) { + hypre_TMemcpy(stack_i + stack_elmts_current, rows, HYPRE_BigInt, nelms, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(stack_j + stack_elmts_current, cols, HYPRE_BigInt, nelms, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(stack_data + stack_elmts_current, values, HYPRE_Complex, nelms, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + } } + else + { + HYPRE_Int stack_elmts_max = hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix); + HYPRE_Int stack_elmts_required = stack_elmts_current + nelms; + + if ( stack_elmts_max < stack_elmts_required ) + { + HYPRE_Int stack_elmts_max_new = hypre_max(hypre_AuxParCSRMatrixUsrOnProcElmts (aux_matrix), 0) + + hypre_max(hypre_AuxParCSRMatrixUsrOffProcElmts(aux_matrix), 0); + if ( hypre_AuxParCSRMatrixUsrOnProcElmts (aux_matrix) < 0 || + hypre_AuxParCSRMatrixUsrOffProcElmts(aux_matrix) < 0 ) + { + stack_elmts_max_new = hypre_max(num_local_rows * hypre_AuxParCSRMatrixInitAllocFactor(aux_matrix), stack_elmts_max_new); + stack_elmts_max_new = hypre_max(stack_elmts_max * hypre_AuxParCSRMatrixGrowFactor(aux_matrix), stack_elmts_max_new); + } + stack_elmts_max_new = hypre_max(stack_elmts_required, stack_elmts_max_new); - HYPRE_THRUST_CALL(fill_n, stack_sora + stack_elmts_current, nelms, SorA); + hypre_AuxParCSRMatrixStackI(aux_matrix) = stack_i = hypre_TReAlloc_v2(stack_i, HYPRE_BigInt, stack_elmts_max, HYPRE_BigInt, stack_elmts_max_new, HYPRE_MEMORY_DEVICE); + hypre_AuxParCSRMatrixStackJ(aux_matrix) = stack_j = hypre_TReAlloc_v2(stack_j, HYPRE_BigInt, stack_elmts_max, HYPRE_BigInt, stack_elmts_max_new, HYPRE_MEMORY_DEVICE); + hypre_AuxParCSRMatrixStackData(aux_matrix) = stack_data = hypre_TReAlloc_v2(stack_data, HYPRE_Complex, stack_elmts_max, HYPRE_Complex, stack_elmts_max_new, HYPRE_MEMORY_DEVICE); + hypre_AuxParCSRMatrixStackSorA(aux_matrix) = stack_sora = hypre_TReAlloc_v2(stack_sora, char, stack_elmts_max, char, stack_elmts_max_new, HYPRE_MEMORY_DEVICE); + hypre_AuxParCSRMatrixMaxStackElmts(aux_matrix) = stack_elmts_max_new; + } - if (ncols) - { - hypreDevice_CsrRowPtrsToIndicesWithRowNum(nrows, nelms, row_ptr, (HYPRE_BigInt *) rows, stack_i + stack_elmts_current); - } - else - { - hypre_TMemcpy(stack_i + stack_elmts_current, rows, HYPRE_BigInt, nelms, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - } - if (row_indexes) - { - HYPRE_Int len, len1; - hypre_TMemcpy(&len1, &row_indexes[nrows-1], HYPRE_Int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); + HYPRE_THRUST_CALL(fill_n, stack_sora + stack_elmts_current, nelms, SorA); + if (ncols) { - hypre_TMemcpy(&len, &ncols[nrows-1], HYPRE_Int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); + hypreDevice_CsrRowPtrsToIndicesWithRowNum(nrows, nelms, row_ptr, (HYPRE_BigInt *) rows, stack_i + stack_elmts_current); } else { - len = 1; + hypre_TMemcpy(stack_i + stack_elmts_current, rows, HYPRE_BigInt, nelms, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + } + + if (row_indexes) + { + HYPRE_Int len, len1; + hypre_TMemcpy(&len1, &row_indexes[nrows-1], HYPRE_Int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); + if (ncols) + { + hypre_TMemcpy(&len, &ncols[nrows-1], HYPRE_Int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); + } + else + { + len = 1; + } + /* this is the *effective* length of cols and values */ + len += len1; + HYPRE_Int *indicator = hypre_CTAlloc(HYPRE_Int, len, HYPRE_MEMORY_DEVICE); + hypreDevice_CsrRowPtrsToIndices_v2(nrows-1, len1, (HYPRE_Int *) row_indexes, indicator); + /* mark unwanted elements as -1 */ + dim3 bDim = hypre_GetDefaultCUDABlockDimension(); + dim3 gDim = hypre_GetDefaultCUDAGridDimension(len1, "thread", bDim); + HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixValues_dev1, gDim, bDim, len1, indicator, (HYPRE_Int *) row_indexes, ncols, indicator ); + + auto new_end = HYPRE_THRUST_CALL( + copy_if, + thrust::make_zip_iterator(thrust::make_tuple(cols, values)), + thrust::make_zip_iterator(thrust::make_tuple(cols + len, values + len)), + indicator, + thrust::make_zip_iterator(thrust::make_tuple(stack_j + stack_elmts_current, + stack_data + stack_elmts_current)), + is_nonnegative() ); + + HYPRE_Int nnz_tmp = thrust::get<0>(new_end.get_iterator_tuple()) - (stack_j + stack_elmts_current); + + hypre_assert(nnz_tmp == nelms); + + hypre_TFree(indicator, HYPRE_MEMORY_DEVICE); + } + else + { + hypre_TMemcpy(stack_j + stack_elmts_current, cols, HYPRE_BigInt, nelms, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(stack_data + stack_elmts_current, values, HYPRE_Complex, nelms, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); } - /* this is the *effective* length of cols and values */ - len += len1; - HYPRE_Int *indicator = hypre_CTAlloc(HYPRE_Int, len, HYPRE_MEMORY_DEVICE); - hypreDevice_CsrRowPtrsToIndices_v2(nrows-1, len1, (HYPRE_Int *) row_indexes, indicator); - /* mark unwanted elements as -1 */ - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(len1, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixValues_dev1, gDim, bDim, len1, indicator, (HYPRE_Int *) row_indexes, ncols, indicator ); - - auto new_end = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(cols, values)), - thrust::make_zip_iterator(thrust::make_tuple(cols + len, values + len)), - indicator, - thrust::make_zip_iterator(thrust::make_tuple(stack_j + stack_elmts_current, - stack_data + stack_elmts_current)), - is_nonnegative() ); - - HYPRE_Int nnz_tmp = thrust::get<0>(new_end.get_iterator_tuple()) - (stack_j + stack_elmts_current); - - hypre_assert(nnz_tmp == nelms); - - hypre_TFree(indicator, HYPRE_MEMORY_DEVICE); - } - else - { - hypre_TMemcpy(stack_j + stack_elmts_current, cols, HYPRE_BigInt, nelms, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(stack_data + stack_elmts_current, values, HYPRE_Complex, nelms, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); } + hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix) += nelms; hypre_TFree(row_ptr, HYPRE_MEMORY_DEVICE); @@ -185,6 +226,44 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, return hypre_error_flag; } +/* helper routine used in hypre_IJMatrixAssembleParCSRDevice: + * 1. sort A0 with key (I0, J0) + * [put the diagonal first; see the comments in hypre_cuda_utils.c] + * 2. reduce A0 [with sum] + * N0: input size; N1: size after reduction (<= N0) + * Note: (I1, J1, A1) are not resized to N1 but have size N0 + */ + +HYPRE_Int +hypre_IJMatrixAssembleSortAndReduce0(HYPRE_BigInt row_start, HYPRE_Int NUM_ROWS, HYPRE_Int GLOBAL_NUM_ROWS, HYPRE_Int GLOBAL_NUM_COLS, + HYPRE_Int NNZ_OWNED, HYPRE_Int N0, HYPRE_BigInt *I0, HYPRE_BigInt *J0, HYPRE_Complex *A0, + HYPRE_Int *N1, HYPRE_BigInt **I1, HYPRE_BigInt **J1, HYPRE_Complex **A1 ) +{ + hypreDevice_StableSortByTupleKey(N0, I0, J0, A0, 2); + + HYPRE_BigInt *I = hypre_TAlloc(HYPRE_BigInt, N0, HYPRE_MEMORY_DEVICE); + HYPRE_BigInt *J = hypre_TAlloc(HYPRE_BigInt, N0, HYPRE_MEMORY_DEVICE); + HYPRE_Complex *A = hypre_TAlloc(HYPRE_Complex, N0, HYPRE_MEMORY_DEVICE); + + auto new_end = HYPRE_THRUST_CALL( + reduce_by_key, + thrust::make_zip_iterator(thrust::make_tuple(I0, J0 )), /* keys_first */ + thrust::make_zip_iterator(thrust::make_tuple(I0 + N0, J0 + N0)), /* keys_last */ + A0, /* values_first */ + thrust::make_zip_iterator(thrust::make_tuple(I, J )), /* keys_output */ + A, /* values_output */ + thrust::equal_to< thrust::tuple >(), /* binary_pred */ + thrust::plus() /* binary_op */); + + *N1 = new_end.second - A; + HYPRE_THRUST_CALL( transform, I, I + *N1, I, _1 - row_start ); + *I1 = I; + *J1 = J; + *A1 = A; + + return hypre_error_flag; +} + template struct hypre_IJMatrixAssembleFunctor : public thrust::binary_function< thrust::tuple, thrust::tuple, thrust::tuple > { @@ -372,6 +451,7 @@ hypre_IJMatrixAssembleSortAndRemove(HYPRE_Int N0, HYPRE_BigInt *I0, HYPRE_BigInt } #endif + HYPRE_Int hypre_IJMatrixAssembleParCSRDevice(hypre_IJMatrix *matrix) { @@ -404,322 +484,499 @@ hypre_IJMatrixAssembleParCSRDevice(hypre_IJMatrix *matrix) HYPRE_Complex *stack_data = hypre_AuxParCSRMatrixStackData(aux_matrix); char *stack_sora = hypre_AuxParCSRMatrixStackSorA(aux_matrix); - in_range pred(row_start, row_end-1); - HYPRE_Int nelms_on = HYPRE_THRUST_CALL(count_if, stack_i, stack_i+nelms, pred); - HYPRE_Int nelms_off = nelms - nelms_on; - HYPRE_Int nelms_off_max; - hypre_MPI_Allreduce(&nelms_off, &nelms_off_max, 1, HYPRE_MPI_INT, hypre_MPI_MAX, comm); - /* communicate for aux off-proc and add to remote aux on-proc */ - if (nelms_off_max) - { - HYPRE_Int new_nnz = 0; - HYPRE_BigInt *new_i = NULL; - HYPRE_BigInt *new_j = NULL; - HYPRE_Complex *new_data = NULL; + HYPRE_Int nelms_off_send = hypre_AuxParCSRMatrixUsrOffProcSendElmts(aux_matrix); + HYPRE_Int nelms_off_recv = hypre_AuxParCSRMatrixUsrOffProcRecvElmts(aux_matrix); + HYPRE_Int nelms_on = hypre_AuxParCSRMatrixUsrOnProcElmts(aux_matrix); - if (nelms_off) + if (nelms_on>=0 && nelms_off_recv>=0 && nelms_off_send>=0) + { + if (nelms_off_send>=0 || nelms_off_recv>=0) { - /* copy off-proc entries out of stack and remove from stack */ - HYPRE_BigInt *off_proc_i = hypre_TAlloc(HYPRE_BigInt, nelms_off, HYPRE_MEMORY_DEVICE); - HYPRE_BigInt *off_proc_j = hypre_TAlloc(HYPRE_BigInt, nelms_off, HYPRE_MEMORY_DEVICE); - HYPRE_Complex *off_proc_data = hypre_TAlloc(HYPRE_Complex, nelms_off, HYPRE_MEMORY_DEVICE); - char *off_proc_sora = hypre_TAlloc(char, nelms_off, HYPRE_MEMORY_DEVICE); - char *is_on_proc = hypre_TAlloc(char, nelms, HYPRE_MEMORY_DEVICE); - - HYPRE_THRUST_CALL(transform, stack_i, stack_i + nelms, is_on_proc, pred); - - auto new_end1 = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(stack_i, stack_j, stack_data, stack_sora )), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(stack_i + nelms, stack_j + nelms, stack_data + nelms, stack_sora + nelms)), /* last */ - is_on_proc, /* stencil */ - thrust::make_zip_iterator(thrust::make_tuple(off_proc_i, off_proc_j, off_proc_data, off_proc_sora)), /* result */ - thrust::not1(thrust::identity()) ); + HYPRE_Int new_nnz = 0; //nelms_off_send; + HYPRE_BigInt *new_i = NULL; + HYPRE_BigInt *new_j = NULL; + HYPRE_Complex *new_data = NULL; - hypre_assert(thrust::get<0>(new_end1.get_iterator_tuple()) - off_proc_i == nelms_off); + if (nelms_off_send>0) + { + /* set the pointers */ + new_nnz = nelms_off_send; + new_i = stack_i + nelms_on; + new_j = stack_j + nelms_on; + new_data = stack_data + nelms_on; + } - /* remove off-proc entries from stack */ - auto new_end2 = HYPRE_THRUST_CALL( - remove_if, - thrust::make_zip_iterator(thrust::make_tuple(stack_i, stack_j, stack_data, stack_sora )), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(stack_i + nelms, stack_j + nelms, stack_data + nelms, stack_sora + nelms)), /* last */ - is_on_proc, /* stencil */ - thrust::not1(thrust::identity()) ); + /* reset this */ + hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix) = nelms_on; - hypre_assert(thrust::get<0>(new_end2.get_iterator_tuple()) - stack_i == nelms_on); + /* enforce this here */ + hypre_AuxParCSRMatrixUsrElmtsFilled(aux_matrix) = 1; - hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix) = nelms_on; + /* send new_i/j/data to remote processes and the receivers call addtovalues */ + hypre_IJMatrixAssembleOffProcValsParCSR(matrix, -1, -1, new_nnz, HYPRE_MEMORY_DEVICE, new_i, new_j, new_data); + } - hypre_TFree(is_on_proc, HYPRE_MEMORY_DEVICE); + /* Note: the stack might have been changed in hypre_IJMatrixAssembleOffProcValsParCSR, + * so must get the size and the pointers again */ + nelms = hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix); + stack_i = hypre_AuxParCSRMatrixStackI(aux_matrix); + stack_j = hypre_AuxParCSRMatrixStackJ(aux_matrix); + stack_data = hypre_AuxParCSRMatrixStackData(aux_matrix); + + if (nelms) + { + HYPRE_Int new_nnz; + HYPRE_BigInt *new_i; + HYPRE_BigInt *new_j; + HYPRE_Complex *new_data; /* sort and reduce */ - hypre_IJMatrixAssembleSortAndReduce3(nelms_off, off_proc_i, off_proc_j, off_proc_sora, off_proc_data, &new_nnz, &new_i, &new_j, &new_data); - // new_nnz = hypre_IJMatrixAssembleSortAndRemove(nelms_off, off_proc_i, off_proc_j, off_proc_sora, off_proc_data); + HYPRE_BigInt global_num_rows = hypre_ParCSRMatrixGlobalNumRows(par_matrix); + HYPRE_BigInt global_num_cols = hypre_ParCSRMatrixGlobalNumCols(par_matrix); + hypre_IJMatrixAssembleSortAndReduce0(row_start, nrows, global_num_rows, global_num_cols, nelms_on, + nelms, stack_i, stack_j, stack_data, &new_nnz, &new_i, &new_j, &new_data); + + HYPRE_Int num_cols_offd_new; + HYPRE_BigInt *col_map_offd_new; + HYPRE_Int *col_map_offd_map; + HYPRE_Int diag_nnz_new; + HYPRE_Int *diag_i_new = NULL; + HYPRE_Int *diag_j_new = NULL; + HYPRE_Complex *diag_a_new = NULL; + HYPRE_Int offd_nnz_new; + HYPRE_Int *offd_i_new = NULL; + HYPRE_Int *offd_j_new = NULL; + HYPRE_Complex *offd_a_new = NULL; + + hypre_CSRMatrixSplitDevice_core( 0, + nrows, + new_nnz, + NULL, + new_j, + NULL, + NULL, + col_start, + col_end - 1, + hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(par_matrix)), + NULL, + NULL, + NULL, + NULL, + &diag_nnz_new, + NULL, + NULL, + NULL, + NULL, + &offd_nnz_new, + NULL, + NULL, + NULL, + NULL ); + + if (diag_nnz_new) + { + diag_i_new = hypre_TAlloc(HYPRE_Int, diag_nnz_new, HYPRE_MEMORY_DEVICE); + diag_j_new = hypre_TAlloc(HYPRE_Int, diag_nnz_new, HYPRE_MEMORY_DEVICE); + diag_a_new = hypre_TAlloc(HYPRE_Complex, diag_nnz_new, HYPRE_MEMORY_DEVICE); + } - hypre_TFree(off_proc_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(off_proc_j, HYPRE_MEMORY_DEVICE); - hypre_TFree(off_proc_data, HYPRE_MEMORY_DEVICE); - hypre_TFree(off_proc_sora, HYPRE_MEMORY_DEVICE); - } + if (offd_nnz_new) + { + offd_i_new = hypre_TAlloc(HYPRE_Int, offd_nnz_new, HYPRE_MEMORY_DEVICE); + offd_j_new = hypre_TAlloc(HYPRE_Int, offd_nnz_new, HYPRE_MEMORY_DEVICE); + offd_a_new = hypre_TAlloc(HYPRE_Complex, offd_nnz_new, HYPRE_MEMORY_DEVICE); + } - /* send new_i/j/data to remote processes and the receivers call addtovalues */ - hypre_IJMatrixAssembleOffProcValsParCSR(matrix, -1, -1, new_nnz, HYPRE_MEMORY_DEVICE, new_i, new_j, new_data); + /* split IJ into diag and offd */ + hypre_CSRMatrixSplitDevice_core( 1, + nrows, + new_nnz, + new_i, + new_j, + new_data, + NULL, + col_start, + col_end - 1, + hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(par_matrix)), + hypre_ParCSRMatrixDeviceColMapOffd(par_matrix), + &col_map_offd_map, + &num_cols_offd_new, + &col_map_offd_new, + &diag_nnz_new, + diag_i_new, + diag_j_new, + diag_a_new, + NULL, + &offd_nnz_new, + offd_i_new, + offd_j_new, + offd_a_new, + NULL ); + + hypre_TFree(new_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(new_j, HYPRE_MEMORY_DEVICE); + hypre_TFree(new_data, HYPRE_MEMORY_DEVICE); + + if (diag_nnz_new > 0) + { + hypre_CSRMatrix *diag = hypre_CSRMatrixCreate(nrows, ncols, diag_nnz_new); + hypre_CSRMatrixI(diag) = hypreDevice_CsrRowIndicesToPtrs(nrows, diag_nnz_new, diag_i_new); + hypre_CSRMatrixJ(diag) = diag_j_new; + hypre_CSRMatrixData(diag) = diag_a_new; + hypre_CSRMatrixMemoryLocation(diag) = HYPRE_MEMORY_DEVICE; + + hypre_TFree(diag_i_new, HYPRE_MEMORY_DEVICE); + + hypre_CSRMatrixDestroy(hypre_ParCSRMatrixDiag(par_matrix)); + hypre_ParCSRMatrixDiag(par_matrix) = diag; + } + if (offd_nnz_new > 0) + { + hypre_CSRMatrix *offd = hypre_CSRMatrixCreate(nrows, num_cols_offd_new, offd_nnz_new); + hypre_CSRMatrixI(offd) = hypreDevice_CsrRowIndicesToPtrs(nrows, offd_nnz_new, offd_i_new); + hypre_CSRMatrixJ(offd) = offd_j_new; + hypre_CSRMatrixData(offd) = offd_a_new; + hypre_CSRMatrixMemoryLocation(offd) = HYPRE_MEMORY_DEVICE; - hypre_TFree(new_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(new_j, HYPRE_MEMORY_DEVICE); - hypre_TFree(new_data, HYPRE_MEMORY_DEVICE); - } + hypre_TFree(offd_i_new, HYPRE_MEMORY_DEVICE); - /* Note: the stack might have been changed in hypre_IJMatrixAssembleOffProcValsParCSR, - * so must get the size and the pointers again */ - nelms = hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix); - stack_i = hypre_AuxParCSRMatrixStackI(aux_matrix); - stack_j = hypre_AuxParCSRMatrixStackJ(aux_matrix); - stack_data = hypre_AuxParCSRMatrixStackData(aux_matrix); - stack_sora = hypre_AuxParCSRMatrixStackSorA(aux_matrix); + hypre_CSRMatrixDestroy(hypre_ParCSRMatrixOffd(par_matrix)); + hypre_ParCSRMatrixOffd(par_matrix) = offd; -#ifdef HYPRE_DEBUG - /* the stack should only have on-proc elements now */ - HYPRE_Int tmp = HYPRE_THRUST_CALL(count_if, stack_i, stack_i+nelms, pred); - hypre_assert(nelms == tmp); -#endif + hypre_TFree(hypre_ParCSRMatrixDeviceColMapOffd(par_matrix), HYPRE_MEMORY_DEVICE); + hypre_ParCSRMatrixDeviceColMapOffd(par_matrix) = col_map_offd_new; - if (nelms) + hypre_TFree(hypre_ParCSRMatrixColMapOffd(par_matrix), HYPRE_MEMORY_HOST); + hypre_ParCSRMatrixColMapOffd(par_matrix) = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_new, HYPRE_MEMORY_HOST); + hypre_TMemcpy(hypre_ParCSRMatrixColMapOffd(par_matrix), col_map_offd_new, HYPRE_BigInt, num_cols_offd_new, + HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); + + col_map_offd_new = NULL; + } + hypre_TFree(col_map_offd_map, HYPRE_MEMORY_DEVICE); + hypre_TFree(col_map_offd_new, HYPRE_MEMORY_DEVICE); + } + } + else { - HYPRE_Int new_nnz; - HYPRE_BigInt *new_i; - HYPRE_BigInt *new_j; - HYPRE_Complex *new_data; - char *new_sora; - - /* sort and reduce */ - hypre_IJMatrixAssembleSortAndReduce1(nelms, stack_i, stack_j, stack_sora, stack_data, - &new_nnz, &new_i, &new_j, &new_sora, &new_data); - - /* adjust row indices from global to local */ - HYPRE_Int *new_i_local = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( transform, - new_i, - new_i + new_nnz, - new_i_local, - _1 - row_start ); - - hypre_TFree(new_i, HYPRE_MEMORY_DEVICE); - - HYPRE_Int num_cols_offd_new; - HYPRE_BigInt *col_map_offd_new; - HYPRE_Int *col_map_offd_map; - HYPRE_Int diag_nnz_new; - HYPRE_Int *diag_i_new = NULL; - HYPRE_Int *diag_j_new = NULL; - HYPRE_Complex *diag_a_new = NULL; - char *diag_sora_new = NULL; - HYPRE_Int offd_nnz_new; - HYPRE_Int *offd_i_new = NULL; - HYPRE_Int *offd_j_new = NULL; - HYPRE_Complex *offd_a_new = NULL; - char *offd_sora_new = NULL; - - HYPRE_Int diag_nnz_existed = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(par_matrix)); - HYPRE_Int offd_nnz_existed = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(par_matrix)); - - hypre_CSRMatrixSplitDevice_core( 0, - nrows, - new_nnz, - NULL, - new_j, - NULL, - NULL, - col_start, - col_end - 1, - hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(par_matrix)), - NULL, - NULL, - NULL, - NULL, - &diag_nnz_new, - NULL, - NULL, - NULL, - NULL, - &offd_nnz_new, - NULL, - NULL, - NULL, - NULL ); - - if (diag_nnz_new) + + in_range pred(row_start, row_end-1); + HYPRE_Int nelms_on = HYPRE_THRUST_CALL(count_if, stack_i, stack_i+nelms, pred); + HYPRE_Int nelms_off = nelms - nelms_on; + HYPRE_Int nelms_off_max; + hypre_MPI_Allreduce(&nelms_off, &nelms_off_max, 1, HYPRE_MPI_INT, hypre_MPI_MAX, comm); + + /* communicate for aux off-proc and add to remote aux on-proc */ + if (nelms_off_max) { - diag_i_new = hypre_TAlloc(HYPRE_Int, diag_nnz_existed + diag_nnz_new, HYPRE_MEMORY_DEVICE); - diag_j_new = hypre_TAlloc(HYPRE_Int, diag_nnz_existed + diag_nnz_new, HYPRE_MEMORY_DEVICE); - diag_a_new = hypre_TAlloc(HYPRE_Complex, diag_nnz_existed + diag_nnz_new, HYPRE_MEMORY_DEVICE); - if (diag_nnz_existed) + HYPRE_Int new_nnz = 0; + HYPRE_BigInt *new_i = NULL; + HYPRE_BigInt *new_j = NULL; + HYPRE_Complex *new_data = NULL; + + if (nelms_off) { - diag_sora_new = hypre_TAlloc(char, diag_nnz_existed + diag_nnz_new, HYPRE_MEMORY_DEVICE); + /* copy off-proc entries out of stack and remove from stack */ + HYPRE_BigInt *off_proc_i = hypre_TAlloc(HYPRE_BigInt, nelms_off, HYPRE_MEMORY_DEVICE); + HYPRE_BigInt *off_proc_j = hypre_TAlloc(HYPRE_BigInt, nelms_off, HYPRE_MEMORY_DEVICE); + HYPRE_Complex *off_proc_data = hypre_TAlloc(HYPRE_Complex, nelms_off, HYPRE_MEMORY_DEVICE); + char *off_proc_sora = hypre_TAlloc(char, nelms_off, HYPRE_MEMORY_DEVICE); + char *is_on_proc = hypre_TAlloc(char, nelms, HYPRE_MEMORY_DEVICE); + + HYPRE_THRUST_CALL(transform, stack_i, stack_i + nelms, is_on_proc, pred); + + auto new_end1 = HYPRE_THRUST_CALL( + copy_if, + thrust::make_zip_iterator(thrust::make_tuple(stack_i, stack_j, stack_data, stack_sora )), /* first */ + thrust::make_zip_iterator(thrust::make_tuple(stack_i + nelms, stack_j + nelms, stack_data + nelms, stack_sora + nelms)), /* last */ + is_on_proc, /* stencil */ + thrust::make_zip_iterator(thrust::make_tuple(off_proc_i, off_proc_j, off_proc_data, off_proc_sora)), /* result */ + thrust::not1(thrust::identity()) ); + + hypre_assert(thrust::get<0>(new_end1.get_iterator_tuple()) - off_proc_i == nelms_off); + + /* remove off-proc entries from stack */ + auto new_end2 = HYPRE_THRUST_CALL( + remove_if, + thrust::make_zip_iterator(thrust::make_tuple(stack_i, stack_j, stack_data, stack_sora )), /* first */ + thrust::make_zip_iterator(thrust::make_tuple(stack_i + nelms, stack_j + nelms, stack_data + nelms, stack_sora + nelms)), /* last */ + is_on_proc, /* stencil */ + thrust::not1(thrust::identity()) ); + + hypre_assert(thrust::get<0>(new_end2.get_iterator_tuple()) - stack_i == nelms_on); + + hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix) = nelms_on; + + hypre_TFree(is_on_proc, HYPRE_MEMORY_DEVICE); + + /* sort and reduce */ + hypre_IJMatrixAssembleSortAndReduce3(nelms_off, off_proc_i, off_proc_j, off_proc_sora, off_proc_data, &new_nnz, &new_i, &new_j, &new_data); + // new_nnz = hypre_IJMatrixAssembleSortAndRemove(nelms_off, off_proc_i, off_proc_j, off_proc_sora, off_proc_data); + + hypre_TFree(off_proc_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(off_proc_j, HYPRE_MEMORY_DEVICE); + hypre_TFree(off_proc_data, HYPRE_MEMORY_DEVICE); + hypre_TFree(off_proc_sora, HYPRE_MEMORY_DEVICE); } + + /* send new_i/j/data to remote processes and the receivers call addtovalues */ + hypre_IJMatrixAssembleOffProcValsParCSR(matrix, -1, -1, new_nnz, HYPRE_MEMORY_DEVICE, new_i, new_j, new_data); + + hypre_TFree(new_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(new_j, HYPRE_MEMORY_DEVICE); + hypre_TFree(new_data, HYPRE_MEMORY_DEVICE); } - if (offd_nnz_new) + /* Note: the stack might have been changed in hypre_IJMatrixAssembleOffProcValsParCSR, + * so must get the size and the pointers again */ + nelms = hypre_AuxParCSRMatrixCurrentStackElmts(aux_matrix); + stack_i = hypre_AuxParCSRMatrixStackI(aux_matrix); + stack_j = hypre_AuxParCSRMatrixStackJ(aux_matrix); + stack_data = hypre_AuxParCSRMatrixStackData(aux_matrix); + stack_sora = hypre_AuxParCSRMatrixStackSorA(aux_matrix); + +#ifdef HYPRE_DEBUG + /* the stack should only have on-proc elements now */ + HYPRE_Int tmp = HYPRE_THRUST_CALL(count_if, stack_i, stack_i+nelms, pred); + hypre_assert(nelms == tmp); +#endif + + if (nelms) { - offd_i_new = hypre_TAlloc(HYPRE_Int, offd_nnz_existed + offd_nnz_new, HYPRE_MEMORY_DEVICE); - offd_j_new = hypre_TAlloc(HYPRE_Int, offd_nnz_existed + offd_nnz_new, HYPRE_MEMORY_DEVICE); - offd_a_new = hypre_TAlloc(HYPRE_Complex, offd_nnz_existed + offd_nnz_new, HYPRE_MEMORY_DEVICE); - if (offd_nnz_existed) + HYPRE_Int new_nnz; + HYPRE_BigInt *new_i; + HYPRE_BigInt *new_j; + HYPRE_Complex *new_data; + char *new_sora; + + /* sort and reduce */ + hypre_IJMatrixAssembleSortAndReduce1(nelms, stack_i, stack_j, stack_sora, stack_data, + &new_nnz, &new_i, &new_j, &new_sora, &new_data); + + /* adjust row indices from global to local */ + HYPRE_Int *new_i_local = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_THRUST_CALL( transform, + new_i, + new_i + new_nnz, + new_i_local, + _1 - row_start ); + + hypre_TFree(new_i, HYPRE_MEMORY_DEVICE); + + HYPRE_Int num_cols_offd_new; + HYPRE_BigInt *col_map_offd_new; + HYPRE_Int *col_map_offd_map; + HYPRE_Int diag_nnz_new; + HYPRE_Int *diag_i_new = NULL; + HYPRE_Int *diag_j_new = NULL; + HYPRE_Complex *diag_a_new = NULL; + char *diag_sora_new = NULL; + HYPRE_Int offd_nnz_new; + HYPRE_Int *offd_i_new = NULL; + HYPRE_Int *offd_j_new = NULL; + HYPRE_Complex *offd_a_new = NULL; + char *offd_sora_new = NULL; + + HYPRE_Int diag_nnz_existed = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixDiag(par_matrix)); + HYPRE_Int offd_nnz_existed = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(par_matrix)); + + hypre_CSRMatrixSplitDevice_core( 0, + nrows, + new_nnz, + NULL, + new_j, + NULL, + NULL, + col_start, + col_end - 1, + hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(par_matrix)), + NULL, + NULL, + NULL, + NULL, + &diag_nnz_new, + NULL, + NULL, + NULL, + NULL, + &offd_nnz_new, + NULL, + NULL, + NULL, + NULL ); + + if (diag_nnz_new) { - offd_sora_new = hypre_TAlloc(char, offd_nnz_existed + offd_nnz_new, HYPRE_MEMORY_DEVICE); + diag_i_new = hypre_TAlloc(HYPRE_Int, diag_nnz_existed + diag_nnz_new, HYPRE_MEMORY_DEVICE); + diag_j_new = hypre_TAlloc(HYPRE_Int, diag_nnz_existed + diag_nnz_new, HYPRE_MEMORY_DEVICE); + diag_a_new = hypre_TAlloc(HYPRE_Complex, diag_nnz_existed + diag_nnz_new, HYPRE_MEMORY_DEVICE); + if (diag_nnz_existed) + { + diag_sora_new = hypre_TAlloc(char, diag_nnz_existed + diag_nnz_new, HYPRE_MEMORY_DEVICE); + } } - } - /* split IJ into diag and offd */ - hypre_CSRMatrixSplitDevice_core( 1, - nrows, - new_nnz, - new_i_local, - new_j, - new_data, - diag_nnz_existed || offd_nnz_existed ? new_sora : NULL, - col_start, - col_end - 1, - hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(par_matrix)), - hypre_ParCSRMatrixDeviceColMapOffd(par_matrix), - &col_map_offd_map, - &num_cols_offd_new, - &col_map_offd_new, - &diag_nnz_new, - diag_i_new + diag_nnz_existed, - diag_j_new + diag_nnz_existed, - diag_a_new + diag_nnz_existed, - diag_nnz_existed ? diag_sora_new + diag_nnz_existed : NULL, - &offd_nnz_new, - offd_i_new + offd_nnz_existed, - offd_j_new + offd_nnz_existed, - offd_a_new + offd_nnz_existed, - offd_nnz_existed ? offd_sora_new + offd_nnz_existed : NULL ); - - hypre_TFree(new_i_local, HYPRE_MEMORY_DEVICE); - hypre_TFree(new_j, HYPRE_MEMORY_DEVICE); - hypre_TFree(new_data, HYPRE_MEMORY_DEVICE); - hypre_TFree(new_sora, HYPRE_MEMORY_DEVICE); - - HYPRE_Int nnz_new; - HYPRE_Int *tmp_i; - HYPRE_Int *tmp_j; - HYPRE_Complex *tmp_a; - - /* expand the existing diag/offd and compress with the new one */ - if (diag_nnz_new > 0) - { - if (diag_nnz_existed) + if (offd_nnz_new) { - /* the existing parcsr should come first and the entries are "add" */ - hypreDevice_CsrRowPtrsToIndices_v2(nrows, diag_nnz_existed, - hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(par_matrix)), diag_i_new); + offd_i_new = hypre_TAlloc(HYPRE_Int, offd_nnz_existed + offd_nnz_new, HYPRE_MEMORY_DEVICE); + offd_j_new = hypre_TAlloc(HYPRE_Int, offd_nnz_existed + offd_nnz_new, HYPRE_MEMORY_DEVICE); + offd_a_new = hypre_TAlloc(HYPRE_Complex, offd_nnz_existed + offd_nnz_new, HYPRE_MEMORY_DEVICE); + if (offd_nnz_existed) + { + offd_sora_new = hypre_TAlloc(char, offd_nnz_existed + offd_nnz_new, HYPRE_MEMORY_DEVICE); + } + } - hypre_TMemcpy(diag_j_new, hypre_CSRMatrixJ(hypre_ParCSRMatrixDiag(par_matrix)), HYPRE_Int, - diag_nnz_existed, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + /* split IJ into diag and offd */ + hypre_CSRMatrixSplitDevice_core( 1, + nrows, + new_nnz, + new_i_local, + new_j, + new_data, + diag_nnz_existed || offd_nnz_existed ? new_sora : NULL, + col_start, + col_end - 1, + hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(par_matrix)), + hypre_ParCSRMatrixDeviceColMapOffd(par_matrix), + &col_map_offd_map, + &num_cols_offd_new, + &col_map_offd_new, + &diag_nnz_new, + diag_i_new + diag_nnz_existed, + diag_j_new + diag_nnz_existed, + diag_a_new + diag_nnz_existed, + diag_nnz_existed ? diag_sora_new + diag_nnz_existed : NULL, + &offd_nnz_new, + offd_i_new + offd_nnz_existed, + offd_j_new + offd_nnz_existed, + offd_a_new + offd_nnz_existed, + offd_nnz_existed ? offd_sora_new + offd_nnz_existed : NULL ); + + hypre_TFree(new_i_local, HYPRE_MEMORY_DEVICE); + hypre_TFree(new_j, HYPRE_MEMORY_DEVICE); + hypre_TFree(new_data, HYPRE_MEMORY_DEVICE); + hypre_TFree(new_sora, HYPRE_MEMORY_DEVICE); + + HYPRE_Int nnz_new; + HYPRE_Int *tmp_i; + HYPRE_Int *tmp_j; + HYPRE_Complex *tmp_a; + + /* expand the existing diag/offd and compress with the new one */ + if (diag_nnz_new > 0) + { + if (diag_nnz_existed) + { + /* the existing parcsr should come first and the entries are "add" */ + hypreDevice_CsrRowPtrsToIndices_v2(nrows, diag_nnz_existed, + hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(par_matrix)), diag_i_new); - hypre_TMemcpy(diag_a_new, hypre_CSRMatrixData(hypre_ParCSRMatrixDiag(par_matrix)), HYPRE_Complex, - diag_nnz_existed, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(diag_j_new, hypre_CSRMatrixJ(hypre_ParCSRMatrixDiag(par_matrix)), HYPRE_Int, + diag_nnz_existed, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL(fill_n, diag_sora_new, diag_nnz_existed, 0); + hypre_TMemcpy(diag_a_new, hypre_CSRMatrixData(hypre_ParCSRMatrixDiag(par_matrix)), HYPRE_Complex, + diag_nnz_existed, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_IJMatrixAssembleSortAndReduce2(diag_nnz_existed + diag_nnz_new, diag_i_new, diag_j_new, diag_sora_new, diag_a_new, - &nnz_new, &tmp_i, &tmp_j, &tmp_a, 2); + HYPRE_THRUST_CALL(fill_n, diag_sora_new, diag_nnz_existed, 0); - hypre_TFree(diag_i_new, HYPRE_MEMORY_DEVICE); - hypre_TFree(diag_j_new, HYPRE_MEMORY_DEVICE); - hypre_TFree(diag_sora_new, HYPRE_MEMORY_DEVICE); - hypre_TFree(diag_a_new, HYPRE_MEMORY_DEVICE); + hypre_IJMatrixAssembleSortAndReduce2(diag_nnz_existed + diag_nnz_new, diag_i_new, diag_j_new, diag_sora_new, diag_a_new, + &nnz_new, &tmp_i, &tmp_j, &tmp_a, 2); - tmp_j = hypre_TReAlloc_v2(tmp_j, HYPRE_Int, diag_nnz_existed + diag_nnz_new, HYPRE_Int, nnz_new, HYPRE_MEMORY_DEVICE); - tmp_a = hypre_TReAlloc_v2(tmp_a, HYPRE_Complex, diag_nnz_existed + diag_nnz_new, HYPRE_Complex, nnz_new, HYPRE_MEMORY_DEVICE); + hypre_TFree(diag_i_new, HYPRE_MEMORY_DEVICE); + hypre_TFree(diag_j_new, HYPRE_MEMORY_DEVICE); + hypre_TFree(diag_sora_new, HYPRE_MEMORY_DEVICE); + hypre_TFree(diag_a_new, HYPRE_MEMORY_DEVICE); - diag_nnz_new = nnz_new; - diag_i_new = tmp_i; - diag_j_new = tmp_j; - diag_a_new = tmp_a; - } + tmp_j = hypre_TReAlloc_v2(tmp_j, HYPRE_Int, diag_nnz_existed + diag_nnz_new, HYPRE_Int, nnz_new, HYPRE_MEMORY_DEVICE); + tmp_a = hypre_TReAlloc_v2(tmp_a, HYPRE_Complex, diag_nnz_existed + diag_nnz_new, HYPRE_Complex, nnz_new, HYPRE_MEMORY_DEVICE); - hypre_CSRMatrix *diag = hypre_CSRMatrixCreate(nrows, ncols, diag_nnz_new); - hypre_CSRMatrixI(diag) = hypreDevice_CsrRowIndicesToPtrs(nrows, diag_nnz_new, diag_i_new); - hypre_CSRMatrixJ(diag) = diag_j_new; - hypre_CSRMatrixData(diag) = diag_a_new; - hypre_CSRMatrixMemoryLocation(diag) = HYPRE_MEMORY_DEVICE; + diag_nnz_new = nnz_new; + diag_i_new = tmp_i; + diag_j_new = tmp_j; + diag_a_new = tmp_a; + } - hypre_TFree(diag_i_new, HYPRE_MEMORY_DEVICE); + hypre_CSRMatrix *diag = hypre_CSRMatrixCreate(nrows, ncols, diag_nnz_new); + hypre_CSRMatrixI(diag) = hypreDevice_CsrRowIndicesToPtrs(nrows, diag_nnz_new, diag_i_new); + hypre_CSRMatrixJ(diag) = diag_j_new; + hypre_CSRMatrixData(diag) = diag_a_new; + hypre_CSRMatrixMemoryLocation(diag) = HYPRE_MEMORY_DEVICE; - hypre_CSRMatrixDestroy(hypre_ParCSRMatrixDiag(par_matrix)); - hypre_ParCSRMatrixDiag(par_matrix) = diag; - } + hypre_TFree(diag_i_new, HYPRE_MEMORY_DEVICE); - if (offd_nnz_new > 0) - { - if (offd_nnz_existed) + hypre_CSRMatrixDestroy(hypre_ParCSRMatrixDiag(par_matrix)); + hypre_ParCSRMatrixDiag(par_matrix) = diag; + } + + if (offd_nnz_new > 0) { - /* the existing parcsr should come first and the entries are "add" */ - hypreDevice_CsrRowPtrsToIndices_v2(nrows, offd_nnz_existed, - hypre_CSRMatrixI(hypre_ParCSRMatrixOffd(par_matrix)), offd_i_new); + if (offd_nnz_existed) + { + /* the existing parcsr should come first and the entries are "add" */ + hypreDevice_CsrRowPtrsToIndices_v2(nrows, offd_nnz_existed, + hypre_CSRMatrixI(hypre_ParCSRMatrixOffd(par_matrix)), offd_i_new); - /* adjust with the new col_map_offd_map */ - HYPRE_THRUST_CALL( gather, - hypre_CSRMatrixJ(hypre_ParCSRMatrixOffd(par_matrix)), - hypre_CSRMatrixJ(hypre_ParCSRMatrixOffd(par_matrix)) + offd_nnz_existed, - col_map_offd_map, - offd_j_new ); + /* adjust with the new col_map_offd_map */ + HYPRE_THRUST_CALL( gather, + hypre_CSRMatrixJ(hypre_ParCSRMatrixOffd(par_matrix)), + hypre_CSRMatrixJ(hypre_ParCSRMatrixOffd(par_matrix)) + offd_nnz_existed, + col_map_offd_map, + offd_j_new ); - hypre_TMemcpy(offd_a_new, hypre_CSRMatrixData(hypre_ParCSRMatrixOffd(par_matrix)), HYPRE_Complex, - offd_nnz_existed, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(offd_a_new, hypre_CSRMatrixData(hypre_ParCSRMatrixOffd(par_matrix)), HYPRE_Complex, + offd_nnz_existed, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL(fill_n, offd_sora_new, offd_nnz_existed, 0); + HYPRE_THRUST_CALL(fill_n, offd_sora_new, offd_nnz_existed, 0); - hypre_IJMatrixAssembleSortAndReduce2(offd_nnz_existed + offd_nnz_new, offd_i_new, offd_j_new, offd_sora_new, offd_a_new, - &nnz_new, &tmp_i, &tmp_j, &tmp_a, 0); + hypre_IJMatrixAssembleSortAndReduce2(offd_nnz_existed + offd_nnz_new, offd_i_new, offd_j_new, offd_sora_new, offd_a_new, + &nnz_new, &tmp_i, &tmp_j, &tmp_a, 0); - hypre_TFree(offd_i_new, HYPRE_MEMORY_DEVICE); - hypre_TFree(offd_j_new, HYPRE_MEMORY_DEVICE); - hypre_TFree(offd_sora_new, HYPRE_MEMORY_DEVICE); - hypre_TFree(offd_a_new, HYPRE_MEMORY_DEVICE); + hypre_TFree(offd_i_new, HYPRE_MEMORY_DEVICE); + hypre_TFree(offd_j_new, HYPRE_MEMORY_DEVICE); + hypre_TFree(offd_sora_new, HYPRE_MEMORY_DEVICE); + hypre_TFree(offd_a_new, HYPRE_MEMORY_DEVICE); - tmp_j = hypre_TReAlloc_v2(tmp_j, HYPRE_Int, offd_nnz_existed + offd_nnz_new, HYPRE_Int, nnz_new, HYPRE_MEMORY_DEVICE); - tmp_a = hypre_TReAlloc_v2(tmp_a, HYPRE_Complex, offd_nnz_existed + offd_nnz_new, HYPRE_Complex, nnz_new, HYPRE_MEMORY_DEVICE); + tmp_j = hypre_TReAlloc_v2(tmp_j, HYPRE_Int, offd_nnz_existed + offd_nnz_new, HYPRE_Int, nnz_new, HYPRE_MEMORY_DEVICE); + tmp_a = hypre_TReAlloc_v2(tmp_a, HYPRE_Complex, offd_nnz_existed + offd_nnz_new, HYPRE_Complex, nnz_new, HYPRE_MEMORY_DEVICE); - offd_nnz_new = nnz_new; - offd_i_new = tmp_i; - offd_j_new = tmp_j; - offd_a_new = tmp_a; - } + offd_nnz_new = nnz_new; + offd_i_new = tmp_i; + offd_j_new = tmp_j; + offd_a_new = tmp_a; + } - hypre_CSRMatrix *offd = hypre_CSRMatrixCreate(nrows, num_cols_offd_new, offd_nnz_new); - hypre_CSRMatrixI(offd) = hypreDevice_CsrRowIndicesToPtrs(nrows, offd_nnz_new, offd_i_new); - hypre_CSRMatrixJ(offd) = offd_j_new; - hypre_CSRMatrixData(offd) = offd_a_new; - hypre_CSRMatrixMemoryLocation(offd) = HYPRE_MEMORY_DEVICE; + hypre_CSRMatrix *offd = hypre_CSRMatrixCreate(nrows, num_cols_offd_new, offd_nnz_new); + hypre_CSRMatrixI(offd) = hypreDevice_CsrRowIndicesToPtrs(nrows, offd_nnz_new, offd_i_new); + hypre_CSRMatrixJ(offd) = offd_j_new; + hypre_CSRMatrixData(offd) = offd_a_new; + hypre_CSRMatrixMemoryLocation(offd) = HYPRE_MEMORY_DEVICE; - hypre_TFree(offd_i_new, HYPRE_MEMORY_DEVICE); + hypre_TFree(offd_i_new, HYPRE_MEMORY_DEVICE); - hypre_CSRMatrixDestroy(hypre_ParCSRMatrixOffd(par_matrix)); - hypre_ParCSRMatrixOffd(par_matrix) = offd; + hypre_CSRMatrixDestroy(hypre_ParCSRMatrixOffd(par_matrix)); + hypre_ParCSRMatrixOffd(par_matrix) = offd; - hypre_TFree(hypre_ParCSRMatrixDeviceColMapOffd(par_matrix), HYPRE_MEMORY_DEVICE); - hypre_ParCSRMatrixDeviceColMapOffd(par_matrix) = col_map_offd_new; + hypre_TFree(hypre_ParCSRMatrixDeviceColMapOffd(par_matrix), HYPRE_MEMORY_DEVICE); + hypre_ParCSRMatrixDeviceColMapOffd(par_matrix) = col_map_offd_new; - hypre_TFree(hypre_ParCSRMatrixColMapOffd(par_matrix), HYPRE_MEMORY_HOST); - hypre_ParCSRMatrixColMapOffd(par_matrix) = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_new, HYPRE_MEMORY_HOST); - hypre_TMemcpy(hypre_ParCSRMatrixColMapOffd(par_matrix), col_map_offd_new, HYPRE_BigInt, num_cols_offd_new, - HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); + hypre_TFree(hypre_ParCSRMatrixColMapOffd(par_matrix), HYPRE_MEMORY_HOST); + hypre_ParCSRMatrixColMapOffd(par_matrix) = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_new, HYPRE_MEMORY_HOST); + hypre_TMemcpy(hypre_ParCSRMatrixColMapOffd(par_matrix), col_map_offd_new, HYPRE_BigInt, num_cols_offd_new, + HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); - col_map_offd_new = NULL; - } + col_map_offd_new = NULL; + } - hypre_TFree(col_map_offd_map, HYPRE_MEMORY_DEVICE); - hypre_TFree(col_map_offd_new, HYPRE_MEMORY_DEVICE); - } /* if (nelms) */ + hypre_TFree(col_map_offd_map, HYPRE_MEMORY_DEVICE); + hypre_TFree(col_map_offd_new, HYPRE_MEMORY_DEVICE); + } /* if (nelms) */ + } hypre_IJMatrixAssembleFlag(matrix) = 1; hypre_AuxParCSRMatrixDestroy(aux_matrix); hypre_IJMatrixTranslator(matrix) = NULL; - return hypre_error_flag; } diff --git a/src/IJ_mv/IJVector_parcsr.c b/src/IJ_mv/IJVector_parcsr.c index f2e635d825..e9e2d2a39d 100644 --- a/src/IJ_mv/IJVector_parcsr.c +++ b/src/IJ_mv/IJVector_parcsr.c @@ -145,6 +145,84 @@ hypre_IJVectorSetMaxOffProcElmtsPar(hypre_IJVector *vector, return hypre_error_flag; } +/****************************************************************************** + * + * hypre_IJVectorSetMaxOnProcElmtsPar + * + *****************************************************************************/ + +HYPRE_Int +hypre_IJVectorSetMaxOnProcElmtsPar(hypre_IJVector *vector, + HYPRE_Int max_on_proc_elmts) +{ + hypre_AuxParVector *aux_vector; + + aux_vector = (hypre_AuxParVector*) hypre_IJVectorTranslator(vector); + if (!aux_vector) + { + hypre_AuxParVectorCreate(&aux_vector); + hypre_IJVectorTranslator(vector) = aux_vector; + } + +#if defined(HYPRE_USING_CUDA) + hypre_AuxParVectorUsrOnProcElmts(aux_vector) = max_on_proc_elmts; +#endif + + return hypre_error_flag; +} + +/****************************************************************************** + * + * hypre_IJVectorSetOffProcSendElmtsPar + * + *****************************************************************************/ + +HYPRE_Int +hypre_IJVectorSetOffProcSendElmtsPar(hypre_IJVector *vector, + HYPRE_Int off_proc_send_elmts) +{ + hypre_AuxParVector *aux_vector; + + aux_vector = (hypre_AuxParVector*) hypre_IJVectorTranslator(vector); + if (!aux_vector) + { + hypre_AuxParVectorCreate(&aux_vector); + hypre_IJVectorTranslator(vector) = aux_vector; + } + +#if defined(HYPRE_USING_CUDA) + hypre_AuxParVectorUsrOffProcSendElmts(aux_vector) = off_proc_send_elmts; +#endif + + return hypre_error_flag; +} + +/****************************************************************************** + * + * hypre_IJVectorSetOffProcRecvElmtsPar + * + *****************************************************************************/ + +HYPRE_Int +hypre_IJVectorSetOffProcRecvElmtsPar(hypre_IJVector *vector, + HYPRE_Int off_proc_recv_elmts) +{ + hypre_AuxParVector *aux_vector; + + aux_vector = (hypre_AuxParVector*) hypre_IJVectorTranslator(vector); + if (!aux_vector) + { + hypre_AuxParVectorCreate(&aux_vector); + hypre_IJVectorTranslator(vector) = aux_vector; + } + +#if defined(HYPRE_USING_CUDA) + hypre_AuxParVectorUsrOffProcRecvElmts(aux_vector) = off_proc_recv_elmts; +#endif + + return hypre_error_flag; +} + /****************************************************************************** * * hypre_IJVectorDistributePar diff --git a/src/IJ_mv/IJVector_parcsr_device.c b/src/IJ_mv/IJVector_parcsr_device.c index 46e65ab0bd..9d67d1daf2 100644 --- a/src/IJ_mv/IJVector_parcsr_device.c +++ b/src/IJ_mv/IJVector_parcsr_device.c @@ -27,12 +27,16 @@ struct hypre_IJVectorAssembleFunctor : public thrust::binary_function< thrust::t } }; +HYPRE_Int hypre_IJVectorAssembleSortAndReduce0(HYPRE_Int N, HYPRE_Int N0, HYPRE_BigInt *I0, HYPRE_Complex *A0, HYPRE_Int *N1, HYPRE_BigInt **I1, HYPRE_Complex **A1 ); + HYPRE_Int hypre_IJVectorAssembleSortAndReduce3(HYPRE_Int N0, HYPRE_BigInt *I0, char *X0, HYPRE_Complex *A0, HYPRE_Int *N1, HYPRE_BigInt **I1, HYPRE_Complex **A1); HYPRE_Int hypre_IJVectorAssembleSortAndReduce1(HYPRE_Int N0, HYPRE_BigInt *I0, char *X0, HYPRE_Complex *A0, HYPRE_Int *N1, HYPRE_BigInt **I1, char **X1, HYPRE_Complex **A1 ); __global__ void hypreCUDAKernel_IJVectorAssemblePar(HYPRE_Int n, HYPRE_Complex *x, HYPRE_BigInt *map, HYPRE_BigInt offset, char *SorA, HYPRE_Complex *y); +__global__ void hypreCUDAKernel_IJVectorAssembleParSet(HYPRE_Int m,HYPRE_Int n, HYPRE_Complex *x, HYPRE_BigInt *map, HYPRE_BigInt offset, HYPRE_Complex *y); + /* */ HYPRE_Int @@ -86,33 +90,71 @@ hypre_IJVectorSetAddValuesParDevice(hypre_IJVector *vector, HYPRE_Complex *stack_data = hypre_AuxParVectorStackData(aux_vector); char *stack_sora = hypre_AuxParVectorStackSorA(aux_vector); - if ( stack_elmts_max < stack_elmts_required ) + /* Each of these parameters must be set to enter this simplified path */ + if (hypre_AuxParVectorUsrOnProcElmts(aux_vector)>=0 && + hypre_AuxParVectorUsrOffProcSendElmts(aux_vector)>=0 && + hypre_AuxParVectorUsrOffProcRecvElmts(aux_vector)>=0) { - HYPRE_Int stack_elmts_max_new = nrows * hypre_AuxParVectorInitAllocFactor(aux_vector); - if (hypre_AuxParVectorUsrOffProcElmts(aux_vector) >= 0) - { - stack_elmts_max_new += hypre_AuxParVectorUsrOffProcElmts(aux_vector); + HYPRE_Int stack_elmts_on_proc = hypre_AuxParVectorUsrOnProcElmts(aux_vector); + HYPRE_Int stack_elmts_off_proc_send = hypre_AuxParVectorUsrOffProcSendElmts(aux_vector); + HYPRE_Int stack_elmts_off_proc_recv = hypre_AuxParVectorUsrOffProcRecvElmts(aux_vector); + char usr_elmts_filled = hypre_AuxParVectorUsrElmtsFilled(aux_vector); + //printf("%s %s %d : on=%d, send=%d, recv=%d, current=%d, filled=%d\n",__FILE__,__FUNCTION__,__LINE__, + // stack_elmts_on_proc,stack_elmts_off_proc_send,stack_elmts_off_proc_recv,stack_elmts_current,usr_elmts_filled); + + /* Neither has been filled ... allocate space */ + if (!stack_i && !stack_data) { + hypre_AuxParVectorStackI(aux_vector) = stack_i = const_cast(indices); + hypre_AuxParVectorStackData(aux_vector) = stack_data = const_cast(values); } - stack_elmts_max_new = hypre_max(stack_elmts_max * hypre_AuxParVectorGrowFactor(aux_vector), stack_elmts_max_new); - stack_elmts_max_new = hypre_max(stack_elmts_required, stack_elmts_max_new); - hypre_AuxParVectorStackI(aux_vector) = stack_i = - hypre_TReAlloc_v2(stack_i, HYPRE_BigInt, stack_elmts_max, HYPRE_BigInt, stack_elmts_max_new, HYPRE_MEMORY_DEVICE); - hypre_AuxParVectorStackData(aux_vector) = stack_data = - hypre_TReAlloc_v2(stack_data, HYPRE_Complex, stack_elmts_max, HYPRE_Complex, stack_elmts_max_new, HYPRE_MEMORY_DEVICE); - hypre_AuxParVectorStackSorA(aux_vector) = stack_sora = - hypre_TReAlloc_v2(stack_sora, char, stack_elmts_max, char, stack_elmts_max_new, HYPRE_MEMORY_DEVICE); + /* grab the current version of the stack size ... used for SetAddValues called from IJAssemble */ + if (SorA==1 && usr_elmts_filled==0) { + /* set the current stack position to 0 and set the filled flag to 1 as this routine is going to fill */ + hypre_assert(stack_elmts_on_proc == num_values); + hypre_AuxParVectorCurrentStackElmts(aux_vector) = stack_elmts_current = 0; + } + else if (SorA==0 && usr_elmts_filled==0) + { + /* set the current stack position to stack_elmts_on_proc + stack_elmts_off_proc_recv; and set the corresponding filled flag to 1 as this routine is going to fill */ + hypre_assert(stack_elmts_off_proc_send == num_values); + hypre_AuxParVectorCurrentStackElmts(aux_vector) = stack_elmts_current = stack_elmts_on_proc; + } - hypre_AuxParVectorMaxStackElmts(aux_vector) = stack_elmts_max_new; - } + if (usr_elmts_filled) { + hypre_TMemcpy(stack_i + stack_elmts_current, indices, HYPRE_BigInt, num_values, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(stack_data + stack_elmts_current, values, HYPRE_Complex, num_values, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + } + } + else + { + if ( stack_elmts_max < stack_elmts_required ) + { + HYPRE_Int stack_elmts_max_new = nrows * hypre_AuxParVectorInitAllocFactor(aux_vector); + if (hypre_AuxParVectorUsrOffProcElmts(aux_vector) >= 0) + { + stack_elmts_max_new += hypre_AuxParVectorUsrOffProcElmts(aux_vector); + } + stack_elmts_max_new = hypre_max(stack_elmts_max * hypre_AuxParVectorGrowFactor(aux_vector), stack_elmts_max_new); + stack_elmts_max_new = hypre_max(stack_elmts_required, stack_elmts_max_new); + + hypre_AuxParVectorStackI(aux_vector) = stack_i = + hypre_TReAlloc_v2(stack_i, HYPRE_BigInt, stack_elmts_max, HYPRE_BigInt, stack_elmts_max_new, HYPRE_MEMORY_DEVICE); + hypre_AuxParVectorStackData(aux_vector) = stack_data = + hypre_TReAlloc_v2(stack_data, HYPRE_Complex, stack_elmts_max, HYPRE_Complex, stack_elmts_max_new, HYPRE_MEMORY_DEVICE); + hypre_AuxParVectorStackSorA(aux_vector) = stack_sora = + hypre_TReAlloc_v2(stack_sora, char, stack_elmts_max, char, stack_elmts_max_new, HYPRE_MEMORY_DEVICE); + + hypre_AuxParVectorMaxStackElmts(aux_vector) = stack_elmts_max_new; + } - HYPRE_THRUST_CALL(fill_n, stack_sora + stack_elmts_current, num_values, SorA); + HYPRE_THRUST_CALL(fill_n, stack_sora + stack_elmts_current, num_values, SorA); - hypre_TMemcpy(stack_i + stack_elmts_current, indices, HYPRE_BigInt, num_values, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(stack_data + stack_elmts_current, values, HYPRE_Complex, num_values, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(stack_i + stack_elmts_current, indices, HYPRE_BigInt, num_values, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(stack_data + stack_elmts_current, values, HYPRE_Complex, num_values, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + } hypre_AuxParVectorCurrentStackElmts(aux_vector) += num_values; - return hypre_error_flag; } @@ -147,108 +189,208 @@ hypre_IJVectorAssembleParDevice(hypre_IJVector *vector) HYPRE_Complex *stack_data = hypre_AuxParVectorStackData(aux_vector); char *stack_sora = hypre_AuxParVectorStackSorA(aux_vector); - in_range pred(vec_start, vec_stop); - HYPRE_Int nelms_on = HYPRE_THRUST_CALL(count_if, stack_i, stack_i+nelms, pred); - HYPRE_Int nelms_off = nelms - nelms_on; - HYPRE_Int nelms_off_max; - hypre_MPI_Allreduce(&nelms_off, &nelms_off_max, 1, HYPRE_MPI_INT, hypre_MPI_MAX, comm); + HYPRE_Int nelms_off_send = hypre_AuxParVectorUsrOffProcSendElmts(aux_vector); + HYPRE_Int nelms_off_recv = hypre_AuxParVectorUsrOffProcRecvElmts(aux_vector); + HYPRE_Int nelms_on = hypre_AuxParVectorUsrOnProcElmts(aux_vector); - /* communicate for aux off-proc and add to remote aux on-proc */ - if (nelms_off_max) + if (nelms_on>=0 && nelms_off_recv>=0 && nelms_off_send>=0) { - HYPRE_Int new_nnz = 0; - HYPRE_BigInt *new_i = NULL; - HYPRE_Complex *new_data = NULL; + //printf("%s %s %d : on=%d, send=%d, recv=%d\n",__FILE__,__FUNCTION__,__LINE__,nelms_on,nelms_off_send,nelms_off_recv); - if (nelms_off) + if (nelms_off_send>=0 || nelms_off_recv>=0) { - /* copy off-proc entries out of stack and remove from stack */ - HYPRE_BigInt *off_proc_i = hypre_TAlloc(HYPRE_BigInt, nelms_off, HYPRE_MEMORY_DEVICE); - HYPRE_Complex *off_proc_data = hypre_TAlloc(HYPRE_Complex, nelms_off, HYPRE_MEMORY_DEVICE); - char *off_proc_sora = hypre_TAlloc(char, nelms_off, HYPRE_MEMORY_DEVICE); - char *is_on_proc = hypre_TAlloc(char, nelms, HYPRE_MEMORY_DEVICE); + HYPRE_Int new_nnz = 0; //nelms_off_send; + HYPRE_BigInt *new_i = NULL; + HYPRE_Complex *new_data = NULL; + + if (nelms_off_send>0) + { + /* set the pointers */ + new_nnz = nelms_off_send; + new_i = stack_i + nelms_on; + new_data = stack_data + nelms_on; + } + + /* reset this */ + hypre_AuxParVectorCurrentStackElmts(aux_vector) = nelms_on; + + /* enforce this here */ + hypre_AuxParVectorUsrElmtsFilled(aux_vector) = 1; + + /* send new_i/j/data to remote processes and the receivers call addtovalues */ + hypre_IJVectorAssembleOffProcValsPar(vector, -1, new_nnz, HYPRE_MEMORY_DEVICE, new_i, new_data); + } - HYPRE_THRUST_CALL(transform, stack_i, stack_i + nelms, is_on_proc, pred); + /* Note: the stack might have been changed in hypre_IJVectorAssembleOffProcValsPar, + * so must get the size and the pointers again */ + nelms = hypre_AuxParVectorCurrentStackElmts(aux_vector); + stack_i = hypre_AuxParVectorStackI(aux_vector); + stack_data = hypre_AuxParVectorStackData(aux_vector); + + if (nelms) + { + HYPRE_Int new_nnz; + HYPRE_BigInt *new_i; + HYPRE_Complex *new_data; - auto new_end1 = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(stack_i, stack_data, stack_sora )), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(stack_i + nelms, stack_data + nelms, stack_sora + nelms)), /* last */ - is_on_proc, /* stencil */ - thrust::make_zip_iterator(thrust::make_tuple(off_proc_i, off_proc_data, off_proc_sora)), /* result */ - thrust::not1(thrust::identity()) ); + /* sort and reduce */ + hypre_IJVectorAssembleSortAndReduce0(nelms_on, nelms, stack_i, stack_data, &new_nnz, &new_i, &new_data); - hypre_assert(thrust::get<0>(new_end1.get_iterator_tuple()) - off_proc_i == nelms_off); + /* set/add to local vector */ + dim3 bDim = hypre_GetDefaultCUDABlockDimension(); + dim3 gDim = hypre_GetDefaultCUDAGridDimension(new_nnz, "thread", bDim); + hypre_TMemcpy(hypre_VectorData(hypre_ParVectorLocalVector(par_vector)), new_data, HYPRE_Complex, nelms_on, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - /* remove off-proc entries from stack */ - auto new_end2 = HYPRE_THRUST_CALL( - remove_if, - thrust::make_zip_iterator(thrust::make_tuple(stack_i, stack_data, stack_sora )), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(stack_i + nelms, stack_data + nelms, stack_sora + nelms)), /* last */ - is_on_proc, /* stencil */ - thrust::not1(thrust::identity()) ); + HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJVectorAssembleParSet, gDim, bDim, nelms_on, new_nnz-nelms_on, new_data, new_i, vec_start, + hypre_VectorData(hypre_ParVectorLocalVector(par_vector)) ); - hypre_assert(thrust::get<0>(new_end2.get_iterator_tuple()) - stack_i == nelms_on); + hypre_TFree(new_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(new_data, HYPRE_MEMORY_DEVICE); + } + } + else + { - hypre_AuxParVectorCurrentStackElmts(aux_vector) = nelms_on; + in_range pred(vec_start, vec_stop); + HYPRE_Int nelms_on = HYPRE_THRUST_CALL(count_if, stack_i, stack_i+nelms, pred); + HYPRE_Int nelms_off = nelms - nelms_on; + HYPRE_Int nelms_off_max; + hypre_MPI_Allreduce(&nelms_off, &nelms_off_max, 1, HYPRE_MPI_INT, hypre_MPI_MAX, comm); - hypre_TFree(is_on_proc, HYPRE_MEMORY_DEVICE); + /* communicate for aux off-proc and add to remote aux on-proc */ + if (nelms_off_max) + { + HYPRE_Int new_nnz = 0; + HYPRE_BigInt *new_i = NULL; + HYPRE_Complex *new_data = NULL; - /* sort and reduce */ - hypre_IJVectorAssembleSortAndReduce3(nelms_off, off_proc_i, off_proc_sora, off_proc_data, &new_nnz, &new_i, &new_data); + if (nelms_off) + { + /* copy off-proc entries out of stack and remove from stack */ + HYPRE_BigInt *off_proc_i = hypre_TAlloc(HYPRE_BigInt, nelms_off, HYPRE_MEMORY_DEVICE); + HYPRE_Complex *off_proc_data = hypre_TAlloc(HYPRE_Complex, nelms_off, HYPRE_MEMORY_DEVICE); + char *off_proc_sora = hypre_TAlloc(char, nelms_off, HYPRE_MEMORY_DEVICE); + char *is_on_proc = hypre_TAlloc(char, nelms, HYPRE_MEMORY_DEVICE); - hypre_TFree(off_proc_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(off_proc_data, HYPRE_MEMORY_DEVICE); - hypre_TFree(off_proc_sora, HYPRE_MEMORY_DEVICE); - } + HYPRE_THRUST_CALL(transform, stack_i, stack_i + nelms, is_on_proc, pred); - /* send new_i/data to remote processes and the receivers call addtovalues */ - hypre_IJVectorAssembleOffProcValsPar(vector, -1, new_nnz, HYPRE_MEMORY_DEVICE, new_i, new_data); + auto new_end1 = HYPRE_THRUST_CALL( + copy_if, + thrust::make_zip_iterator(thrust::make_tuple(stack_i, stack_data, stack_sora )), /* first */ + thrust::make_zip_iterator(thrust::make_tuple(stack_i + nelms, stack_data + nelms, stack_sora + nelms)), /* last */ + is_on_proc, /* stencil */ + thrust::make_zip_iterator(thrust::make_tuple(off_proc_i, off_proc_data, off_proc_sora)), /* result */ + thrust::not1(thrust::identity()) ); - hypre_TFree(new_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(new_data, HYPRE_MEMORY_DEVICE); - } + hypre_assert(thrust::get<0>(new_end1.get_iterator_tuple()) - off_proc_i == nelms_off); + + /* remove off-proc entries from stack */ + auto new_end2 = HYPRE_THRUST_CALL( + remove_if, + thrust::make_zip_iterator(thrust::make_tuple(stack_i, stack_data, stack_sora )), /* first */ + thrust::make_zip_iterator(thrust::make_tuple(stack_i + nelms, stack_data + nelms, stack_sora + nelms)), /* last */ + is_on_proc, /* stencil */ + thrust::not1(thrust::identity()) ); + + hypre_assert(thrust::get<0>(new_end2.get_iterator_tuple()) - stack_i == nelms_on); + + hypre_AuxParVectorCurrentStackElmts(aux_vector) = nelms_on; + + hypre_TFree(is_on_proc, HYPRE_MEMORY_DEVICE); + + /* sort and reduce */ + hypre_IJVectorAssembleSortAndReduce3(nelms_off, off_proc_i, off_proc_sora, off_proc_data, &new_nnz, &new_i, &new_data); + + hypre_TFree(off_proc_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(off_proc_data, HYPRE_MEMORY_DEVICE); + hypre_TFree(off_proc_sora, HYPRE_MEMORY_DEVICE); + } - /* Note: the stack might have been changed in hypre_IJVectorAssembleOffProcValsPar, - * so must get the size and the pointers again */ - nelms = hypre_AuxParVectorCurrentStackElmts(aux_vector); - stack_i = hypre_AuxParVectorStackI(aux_vector); - stack_data = hypre_AuxParVectorStackData(aux_vector); - stack_sora = hypre_AuxParVectorStackSorA(aux_vector); + /* send new_i/data to remote processes and the receivers call addtovalues */ + hypre_IJVectorAssembleOffProcValsPar(vector, -1, new_nnz, HYPRE_MEMORY_DEVICE, new_i, new_data); + + hypre_TFree(new_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(new_data, HYPRE_MEMORY_DEVICE); + } + + /* Note: the stack might have been changed in hypre_IJVectorAssembleOffProcValsPar, + * so must get the size and the pointers again */ + nelms = hypre_AuxParVectorCurrentStackElmts(aux_vector); + stack_i = hypre_AuxParVectorStackI(aux_vector); + stack_data = hypre_AuxParVectorStackData(aux_vector); + stack_sora = hypre_AuxParVectorStackSorA(aux_vector); #ifdef HYPRE_DEBUG - /* the stack should only have on-proc elements now */ - HYPRE_Int tmp = HYPRE_THRUST_CALL(count_if, stack_i, stack_i+nelms, pred); - hypre_assert(nelms == tmp); + /* the stack should only have on-proc elements now */ + HYPRE_Int tmp = HYPRE_THRUST_CALL(count_if, stack_i, stack_i+nelms, pred); + hypre_assert(nelms == tmp); #endif - if (nelms) - { - HYPRE_Int new_nnz; - HYPRE_BigInt *new_i; - HYPRE_Complex *new_data; - char *new_sora; - - /* sort and reduce */ - hypre_IJVectorAssembleSortAndReduce1(nelms, stack_i, stack_sora, stack_data, &new_nnz, &new_i, &new_sora, &new_data); - - /* set/add to local vector */ - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(new_nnz, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJVectorAssemblePar, gDim, bDim, new_nnz, new_data, new_i, vec_start, new_sora, + if (nelms) + { + HYPRE_Int new_nnz; + HYPRE_BigInt *new_i; + HYPRE_Complex *new_data; + char *new_sora; + + /* sort and reduce */ + hypre_IJVectorAssembleSortAndReduce1(nelms, stack_i, stack_sora, stack_data, &new_nnz, &new_i, &new_sora, &new_data); + + /* set/add to local vector */ + dim3 bDim = hypre_GetDefaultCUDABlockDimension(); + dim3 gDim = hypre_GetDefaultCUDAGridDimension(new_nnz, "thread", bDim); + HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJVectorAssemblePar, gDim, bDim, new_nnz, new_data, new_i, vec_start, new_sora, hypre_VectorData(hypre_ParVectorLocalVector(par_vector)) ); - hypre_TFree(new_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(new_data, HYPRE_MEMORY_DEVICE); - hypre_TFree(new_sora, HYPRE_MEMORY_DEVICE); + hypre_TFree(new_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(new_data, HYPRE_MEMORY_DEVICE); + hypre_TFree(new_sora, HYPRE_MEMORY_DEVICE); + } } - hypre_AuxParVectorDestroy(aux_vector); hypre_IJVectorTranslator(vector) = NULL; return hypre_error_flag; } +/* helper routine used in hypre_IJVectorAssembleParCSRDevice: + * 1. sort A0 with key I0 + * 2. reduce A0 [with sum] + * N0: input size; N1: size after reduction (<= N0) + * Note: (I1, A1) are not resized to N1 but have size N0 + */ +HYPRE_Int +hypre_IJVectorAssembleSortAndReduce0(HYPRE_Int N, HYPRE_Int N0, HYPRE_BigInt *I0, HYPRE_Complex *A0, + HYPRE_Int *N1, HYPRE_BigInt **I1, HYPRE_Complex **A1 ) +{ + HYPRE_THRUST_CALL( stable_sort_by_key, + I0 + N, + I0 + N0, + A0 + N); + + HYPRE_BigInt *I = hypre_TAlloc(HYPRE_BigInt, N0, HYPRE_MEMORY_DEVICE); + HYPRE_Complex *A = hypre_TAlloc(HYPRE_Complex, N0, HYPRE_MEMORY_DEVICE); + + hypre_TMemcpy(I, I0, HYPRE_BigInt, N, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(A, A0, HYPRE_Complex, N, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + + auto new_end = HYPRE_THRUST_CALL( + reduce_by_key, + I0 + N, /* keys_first */ + I0 + N0, /* keys_last */ + A0 + N, /* values_first */ + I + N, /* keys_output */ + A + N, /* values_output */ + thrust::equal_to(), /* binary_pred */ + thrust::plus() /* binary_op */); + + *N1 = new_end.first - I; + *I1 = I; + *A1 = A; + + return hypre_error_flag; +} + /* helper routine used in hypre_IJVectorAssembleParCSRDevice: * 1. sort (X0, A0) with key I0 * 2. for each segment in I0, zero out in A0 all before the last `set' @@ -300,6 +442,7 @@ hypre_IJVectorAssembleSortAndReduce1(HYPRE_Int N0, HYPRE_BigInt *I0, char *X0 return hypre_error_flag; } + HYPRE_Int hypre_IJVectorAssembleSortAndReduce3(HYPRE_Int N0, HYPRE_BigInt *I0, char *X0, HYPRE_Complex *A0, HYPRE_Int *N1, HYPRE_BigInt **I1, HYPRE_Complex **A1) @@ -361,4 +504,19 @@ hypreCUDAKernel_IJVectorAssemblePar(HYPRE_Int n, HYPRE_Complex *x, HYPRE_BigInt } } +/* y[map[i]-offset] = x[i] or y[map[i]] += x[i] depending on SorA, + * same index cannot appear more than once in map */ +__global__ void +hypreCUDAKernel_IJVectorAssembleParSet(HYPRE_Int m, HYPRE_Int n, HYPRE_Complex *x, HYPRE_BigInt *map, HYPRE_BigInt offset, HYPRE_Complex *y) +{ + HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>(); + + if (i >= n) + { + return; + } + + y[map[i+m]-offset] += x[i+m]; +} + #endif diff --git a/src/IJ_mv/_hypre_IJ_mv.h b/src/IJ_mv/_hypre_IJ_mv.h index 6fd48c2666..84b69d35df 100644 --- a/src/IJ_mv/_hypre_IJ_mv.h +++ b/src/IJ_mv/_hypre_IJ_mv.h @@ -86,9 +86,12 @@ typedef struct HYPRE_BigInt *stack_i; HYPRE_BigInt *stack_j; HYPRE_Complex *stack_data; - char *stack_sora; /* Set (1) or Add (0) */ - HYPRE_Int usr_on_proc_elmts; /* user given num elmt on-proc */ - HYPRE_Int usr_off_proc_elmts; /* user given num elmt off-proc */ + char *stack_sora; /* Set (1) or Add (0) */ + HYPRE_Int usr_off_proc_elmts; /* user given num elmt off-proc */ + HYPRE_Int usr_on_proc_elmts; /* user given num elmt on-proc */ + HYPRE_Int usr_off_proc_send_elmts; /* user given num elmts to send to another process */ + HYPRE_Int usr_off_proc_recv_elmts; /* user given num elmts to receive from all other processes */ + char usr_elmts_filled; /* flag for indicating that the external elements have been filled from external (user) calls to Set/AddToValues calls */ HYPRE_Real init_alloc_factor; HYPRE_Real grow_factor; #endif @@ -133,6 +136,9 @@ typedef struct #define hypre_AuxParCSRMatrixStackSorA(matrix) ((matrix) -> stack_sora) #define hypre_AuxParCSRMatrixUsrOnProcElmts(matrix) ((matrix) -> usr_on_proc_elmts) #define hypre_AuxParCSRMatrixUsrOffProcElmts(matrix) ((matrix) -> usr_off_proc_elmts) +#define hypre_AuxParCSRMatrixUsrOffProcSendElmts(matrix) ((matrix) -> usr_off_proc_send_elmts) +#define hypre_AuxParCSRMatrixUsrOffProcRecvElmts(matrix) ((matrix) -> usr_off_proc_recv_elmts) +#define hypre_AuxParCSRMatrixUsrElmtsFilled(matrix) ((matrix) -> usr_elmts_filled) #define hypre_AuxParCSRMatrixInitAllocFactor(matrix) ((matrix) -> init_alloc_factor) #define hypre_AuxParCSRMatrixGrowFactor(matrix) ((matrix) -> grow_factor) #endif @@ -176,7 +182,11 @@ typedef struct HYPRE_BigInt *stack_i; /* contains row indices */ HYPRE_Complex *stack_data; /* contains corresponding data */ char *stack_sora; - HYPRE_Int usr_off_proc_elmts; /* the num of off-proc elements usr guided */ + HYPRE_Int usr_off_proc_elmts; /* user given num elmt off-proc */ + HYPRE_Int usr_on_proc_elmts; /* user given num elmt on-proc */ + HYPRE_Int usr_off_proc_send_elmts; /* user given num elmts to send to another process */ + HYPRE_Int usr_off_proc_recv_elmts; /* user given num elmts to receive from all other processes */ + char usr_elmts_filled; /* flag for indicating that the external elements have been filled from external (user) calls to Set/AddToValues calls */ HYPRE_Real init_alloc_factor; HYPRE_Real grow_factor; #endif @@ -200,6 +210,10 @@ typedef struct #define hypre_AuxParVectorStackData(vector) ((vector) -> stack_data) #define hypre_AuxParVectorStackSorA(vector) ((vector) -> stack_sora) #define hypre_AuxParVectorUsrOffProcElmts(vector) ((vector) -> usr_off_proc_elmts) +#define hypre_AuxParVectorUsrOnProcElmts(vector) ((vector) -> usr_on_proc_elmts) +#define hypre_AuxParVectorUsrOffProcSendElmts(vector) ((vector) -> usr_off_proc_send_elmts) +#define hypre_AuxParVectorUsrOffProcRecvElmts(vector) ((vector) -> usr_off_proc_recv_elmts) +#define hypre_AuxParVectorUsrElmtsFilled(vector) ((vector) -> usr_elmts_filled) #define hypre_AuxParVectorInitAllocFactor(vector) ((vector) -> init_alloc_factor) #define hypre_AuxParVectorGrowFactor(vector) ((vector) -> grow_factor) #endif @@ -425,6 +439,9 @@ HYPRE_Int hypre_IJMatrixCreateParCSR ( hypre_IJMatrix *matrix ); HYPRE_Int hypre_IJMatrixSetRowSizesParCSR ( hypre_IJMatrix *matrix , const HYPRE_Int *sizes ); HYPRE_Int hypre_IJMatrixSetDiagOffdSizesParCSR ( hypre_IJMatrix *matrix , const HYPRE_Int *diag_sizes , const HYPRE_Int *offdiag_sizes ); HYPRE_Int hypre_IJMatrixSetMaxOffProcElmtsParCSR ( hypre_IJMatrix *matrix , HYPRE_Int max_off_proc_elmts ); +HYPRE_Int hypre_IJMatrixSetMaxOnProcElmtsParCSR ( hypre_IJMatrix *matrix , HYPRE_Int max_on_proc_elmts ); +HYPRE_Int hypre_IJMatrixSetOffProcSendElmtsParCSR ( hypre_IJMatrix *matrix , HYPRE_Int off_proc_send_elmts ); +HYPRE_Int hypre_IJMatrixSetOffProcRecvElmtsParCSR ( hypre_IJMatrix *matrix , HYPRE_Int off_proc_recv_elmts ); HYPRE_Int hypre_IJMatrixInitializeParCSR ( hypre_IJMatrix *matrix ); HYPRE_Int hypre_IJMatrixGetRowCountsParCSR ( hypre_IJMatrix *matrix , HYPRE_Int nrows , HYPRE_BigInt *rows , HYPRE_Int *ncols ); HYPRE_Int hypre_IJMatrixGetValuesParCSR ( hypre_IJMatrix *matrix , HYPRE_Int nrows , HYPRE_Int *ncols , HYPRE_BigInt *rows , HYPRE_BigInt *cols , HYPRE_Complex *values ); @@ -470,6 +487,9 @@ HYPRE_Int hypre_IJVectorDestroyPar ( hypre_IJVector *vector ); HYPRE_Int hypre_IJVectorInitializePar ( hypre_IJVector *vector ); HYPRE_Int hypre_IJVectorInitializePar_v2(hypre_IJVector *vector, HYPRE_MemoryLocation memory_location); HYPRE_Int hypre_IJVectorSetMaxOffProcElmtsPar ( hypre_IJVector *vector , HYPRE_Int max_off_proc_elmts ); +HYPRE_Int hypre_IJVectorSetMaxOnProcElmtsPar ( hypre_IJVector *vector , HYPRE_Int max_on_proc_elmts ); +HYPRE_Int hypre_IJVectorSetOffProcSendElmtsPar ( hypre_IJVector *vector , HYPRE_Int off_proc_send_elmts ); +HYPRE_Int hypre_IJVectorSetOffProcRecvElmtsPar ( hypre_IJVector *vector , HYPRE_Int off_proc_recv_elmts ); HYPRE_Int hypre_IJVectorDistributePar ( hypre_IJVector *vector , const HYPRE_Int *vec_starts ); HYPRE_Int hypre_IJVectorZeroValuesPar ( hypre_IJVector *vector ); HYPRE_Int hypre_IJVectorSetValuesPar ( hypre_IJVector *vector , HYPRE_Int num_values , const HYPRE_BigInt *indices , const HYPRE_Complex *values ); @@ -498,6 +518,9 @@ HYPRE_Int HYPRE_IJMatrixGetObject ( HYPRE_IJMatrix matrix , void **object ); HYPRE_Int HYPRE_IJMatrixSetRowSizes ( HYPRE_IJMatrix matrix , const HYPRE_Int *sizes ); HYPRE_Int HYPRE_IJMatrixSetDiagOffdSizes ( HYPRE_IJMatrix matrix , const HYPRE_Int *diag_sizes , const HYPRE_Int *offdiag_sizes ); HYPRE_Int HYPRE_IJMatrixSetMaxOffProcElmts ( HYPRE_IJMatrix matrix , HYPRE_Int max_off_proc_elmts ); +HYPRE_Int HYPRE_IJMatrixSetMaxOnProcElmts ( HYPRE_IJMatrix matrix , HYPRE_Int max_on_proc_elmts ); +HYPRE_Int HYPRE_IJMatrixSetOffProcSendElmts ( HYPRE_IJMatrix matrix , HYPRE_Int off_proc_send_elmts ); +HYPRE_Int HYPRE_IJMatrixSetOffProcRecvElmts ( HYPRE_IJMatrix matrix , HYPRE_Int off_proc_recv_elmts ); HYPRE_Int HYPRE_IJMatrixRead ( const char *filename , MPI_Comm comm , HYPRE_Int type , HYPRE_IJMatrix *matrix_ptr ); HYPRE_Int HYPRE_IJMatrixPrint ( HYPRE_IJMatrix matrix , const char *filename ); HYPRE_Int HYPRE_IJMatrixSetOMPFlag ( HYPRE_IJMatrix matrix , HYPRE_Int omp_flag ); @@ -512,6 +535,9 @@ HYPRE_Int HYPRE_IJVectorAddToValues ( HYPRE_IJVector vector , HYPRE_Int nvalues HYPRE_Int HYPRE_IJVectorAssemble ( HYPRE_IJVector vector ); HYPRE_Int HYPRE_IJVectorGetValues ( HYPRE_IJVector vector , HYPRE_Int nvalues , const HYPRE_BigInt *indices , HYPRE_Complex *values ); HYPRE_Int HYPRE_IJVectorSetMaxOffProcElmts ( HYPRE_IJVector vector , HYPRE_Int max_off_proc_elmts ); +HYPRE_Int HYPRE_IJVectorSetMaxOnProcElmts ( HYPRE_IJVector vector , HYPRE_Int max_on_proc_elmts ); +HYPRE_Int HYPRE_IJVectorSetOffProcSendElmts ( HYPRE_IJVector vector , HYPRE_Int off_proc_send_elmts ); +HYPRE_Int HYPRE_IJVectorSetOffProcRecvElmts ( HYPRE_IJVector vector , HYPRE_Int off_proc_recv_elmts ); HYPRE_Int HYPRE_IJVectorSetObjectType ( HYPRE_IJVector vector , HYPRE_Int type ); HYPRE_Int HYPRE_IJVectorGetObjectType ( HYPRE_IJVector vector , HYPRE_Int *type ); HYPRE_Int HYPRE_IJVectorGetLocalRange ( HYPRE_IJVector vector , HYPRE_BigInt *jlower , HYPRE_BigInt *jupper ); diff --git a/src/IJ_mv/aux_par_vector.c b/src/IJ_mv/aux_par_vector.c index a2e20f5669..e99bfc0893 100644 --- a/src/IJ_mv/aux_par_vector.c +++ b/src/IJ_mv/aux_par_vector.c @@ -39,6 +39,10 @@ hypre_AuxParVectorCreate( hypre_AuxParVector **aux_vector) hypre_AuxParVectorStackData(vector) = NULL; hypre_AuxParVectorStackSorA(vector) = NULL; hypre_AuxParVectorUsrOffProcElmts(vector) = -1; + hypre_AuxParVectorUsrOnProcElmts(vector) = -1; + hypre_AuxParVectorUsrOffProcSendElmts(vector) = -1; + hypre_AuxParVectorUsrOffProcRecvElmts(vector) = -1; + hypre_AuxParVectorUsrElmtsFilled(vector) = 0; hypre_AuxParVectorInitAllocFactor(vector) = 1.5; hypre_AuxParVectorGrowFactor(vector) = 2.0; #endif @@ -62,9 +66,15 @@ hypre_AuxParVectorDestroy( hypre_AuxParVector *vector ) hypre_TFree(hypre_AuxParVectorOffProcData(vector), HYPRE_MEMORY_HOST); #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - hypre_TFree(hypre_AuxParVectorStackI(vector), hypre_AuxParVectorMemoryLocation(vector)); - hypre_TFree(hypre_AuxParVectorStackData(vector), hypre_AuxParVectorMemoryLocation(vector)); - hypre_TFree(hypre_AuxParVectorStackSorA(vector), hypre_AuxParVectorMemoryLocation(vector)); + if (hypre_AuxParVectorUsrElmtsFilled(vector)==0) { + hypre_TFree(hypre_AuxParVectorStackI(vector), hypre_AuxParVectorMemoryLocation(vector)); + hypre_TFree(hypre_AuxParVectorStackData(vector), hypre_AuxParVectorMemoryLocation(vector)); + hypre_TFree(hypre_AuxParVectorStackSorA(vector), hypre_AuxParVectorMemoryLocation(vector)); + } else { + hypre_AuxParVectorStackI(vector) = NULL; + hypre_AuxParVectorStackData(vector) = NULL; + hypre_AuxParVectorStackSorA(vector) = NULL; + } #endif hypre_TFree(vector, HYPRE_MEMORY_HOST); diff --git a/src/IJ_mv/aux_parcsr_matrix.c b/src/IJ_mv/aux_parcsr_matrix.c index 83a150bc30..e2fd4ffd8d 100644 --- a/src/IJ_mv/aux_parcsr_matrix.c +++ b/src/IJ_mv/aux_parcsr_matrix.c @@ -59,8 +59,11 @@ hypre_AuxParCSRMatrixCreate( hypre_AuxParCSRMatrix **aux_matrix, hypre_AuxParCSRMatrixStackJ(matrix) = NULL; hypre_AuxParCSRMatrixStackData(matrix) = NULL; hypre_AuxParCSRMatrixStackSorA(matrix) = NULL; - hypre_AuxParCSRMatrixUsrOnProcElmts(matrix) = -1; hypre_AuxParCSRMatrixUsrOffProcElmts(matrix) = -1; + hypre_AuxParCSRMatrixUsrOnProcElmts(matrix) = -1; + hypre_AuxParCSRMatrixUsrOffProcSendElmts(matrix) = -1; + hypre_AuxParCSRMatrixUsrOffProcRecvElmts(matrix) = -1; + hypre_AuxParCSRMatrixUsrElmtsFilled(matrix) = 0; hypre_AuxParCSRMatrixInitAllocFactor(matrix) = 5.0; hypre_AuxParCSRMatrixGrowFactor(matrix) = 2.0; #endif @@ -143,10 +146,17 @@ hypre_AuxParCSRMatrixDestroy( hypre_AuxParCSRMatrix *matrix ) hypre_TFree(hypre_AuxParCSRMatrixOffProcData(matrix), HYPRE_MEMORY_HOST); #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - hypre_TFree(hypre_AuxParCSRMatrixStackI(matrix), hypre_AuxParCSRMatrixMemoryLocation(matrix)); - hypre_TFree(hypre_AuxParCSRMatrixStackJ(matrix), hypre_AuxParCSRMatrixMemoryLocation(matrix)); - hypre_TFree(hypre_AuxParCSRMatrixStackData(matrix), hypre_AuxParCSRMatrixMemoryLocation(matrix)); - hypre_TFree(hypre_AuxParCSRMatrixStackSorA(matrix), hypre_AuxParCSRMatrixMemoryLocation(matrix)); + if (hypre_AuxParCSRMatrixUsrElmtsFilled(matrix)==0) { + hypre_TFree(hypre_AuxParCSRMatrixStackI(matrix), hypre_AuxParCSRMatrixMemoryLocation(matrix)); + hypre_TFree(hypre_AuxParCSRMatrixStackJ(matrix), hypre_AuxParCSRMatrixMemoryLocation(matrix)); + hypre_TFree(hypre_AuxParCSRMatrixStackData(matrix), hypre_AuxParCSRMatrixMemoryLocation(matrix)); + hypre_TFree(hypre_AuxParCSRMatrixStackSorA(matrix), hypre_AuxParCSRMatrixMemoryLocation(matrix)); + } else { + hypre_AuxParCSRMatrixStackI(matrix) = NULL; + hypre_AuxParCSRMatrixStackJ(matrix) = NULL; + hypre_AuxParCSRMatrixStackData(matrix) = NULL; + hypre_AuxParCSRMatrixStackSorA(matrix) = NULL; + } #endif hypre_TFree(matrix, HYPRE_MEMORY_HOST);