From 63cf008ce2740a951a5395b08a6f887e034ddf2d Mon Sep 17 00:00:00 2001
From: Hui Zhou <hzhou321@anl.gov>
Date: Mon, 5 Feb 2024 21:32:13 -0600
Subject: [PATCH] ch4/ofi: use explicit counters to track gpu pipeline

Don't mix the usage of cc_ptr; instead, use separate and explicit counters
to track the progress and completion of chunks.
---
 src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c | 30 ++++++++++++----------
 src/mpid/ch4/netmod/ofi/ofi_pre.h          |  3 +++
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c
index 388bf2f2f5d..101fdae7a71 100644
--- a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c
+++ b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c
@@ -59,6 +59,7 @@ int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf,
     MPIDI_OFI_idata_set_gpuchunk_bits(&cq_data, n_chunks);
     MPIDI_OFI_idata_set_gpu_packed_bit(&cq_data, is_packed);
 
+    MPIDI_OFI_REQUEST(sreq, pipeline_info.send.num_remain) = n_chunks;
     MPIDI_OFI_REQUEST(sreq, pipeline_info.send.cq_data) = cq_data;
     MPIDI_OFI_REQUEST(sreq, pipeline_info.send.remote_addr) = remote_addr;
     MPIDI_OFI_REQUEST(sreq, pipeline_info.send.vci_local) = vci_local;
@@ -123,9 +124,6 @@ static int send_alloc_poll(MPIR_Async_thing * thing)
         p->offset += (size_t) chunk_sz;
         p->left_sz -= (size_t) chunk_sz;
         p->n_chunks++;
-        /* Increase request completion cnt, cc is 1 more than necessary
-         * to prevent parent request being freed prematurally. */
-        MPIR_cc_inc(p->sreq->cc_ptr);
 
         spawn_send_copy(thing, p->sreq, &async_req, host_buf, chunk_sz);
 
@@ -228,11 +226,10 @@ int MPIDI_OFI_gpu_pipeline_send_event(struct fi_cq_tagged_entry *wc, MPIR_Reques
 
     MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_send_pool, host_buf);
 
-    int c;
-    MPIR_cc_decr(sreq->cc_ptr, &c);
-    if (c == 0) {
+    MPIDI_OFI_REQUEST(sreq, pipeline_info.send.num_remain) -= 1;
+    if (MPIDI_OFI_REQUEST(sreq, pipeline_info.send.num_remain) == 0) {
         MPIR_Datatype_release_if_not_builtin(MPIDI_OFI_REQUEST(sreq, datatype));
-        MPIR_Request_free(sreq);
+        MPIDI_Request_complete_fast(sreq);
     }
 
     return mpi_errno;
@@ -259,6 +256,8 @@ int MPIDI_OFI_gpu_pipeline_recv(MPIR_Request * rreq,
 
     /* The 1st recv is an empty chunk for matching. We need initialize rreq. */
     MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.offset) = 0;
+    MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_inrecv) = 0;
+    MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_remain) = 0;
     MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.is_sync) = false;
     MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.remote_addr) = remote_addr;
     MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.vci_local) = vci_local;
@@ -305,7 +304,8 @@ static int recv_alloc_poll(MPIR_Async_thing * thing)
     struct recv_alloc *p = MPIR_Async_thing_get_state(thing);
     MPIR_Request *rreq = p->rreq;
 
-    if (MPIR_cc_get(rreq->cc) > 1) {
+    /* arbitrary threshold */
+    if (MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_inrecv) > 1) {
         return MPIR_ASYNC_THING_NOPROGRESS;
     }
 
@@ -339,6 +339,7 @@ static int recv_alloc_poll(MPIR_Async_thing * thing)
                        match_bits, mask_bits, (void *) &chunk_req->context);
     MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(vci).lock);
     if (ret == 0) {
+        MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_inrecv) += 1;
         free(p);
         /* chunk_req and host_buf will be freed in recv_events */
         return MPIR_ASYNC_THING_DONE;
@@ -382,17 +383,18 @@ int MPIDI_OFI_gpu_pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Reques
         uint32_t packed = MPIDI_OFI_idata_get_gpu_packed_bit(wc->data);
         uint32_t n_chunks = MPIDI_OFI_idata_get_gpuchunk_bits(wc->data);
         /* ? - Not sure why sender cannot send packed data */
-        MPIR_Assertp(packed == 0);
+        MPIR_Assert(packed == 0);
         if (wc->len > 0) {
             /* message from a normal send */
             MPIR_Assert(n_chunks == 0);
+            MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_remain) = 1;
             mpi_errno = start_recv_copy(rreq, host_buf, wc->len, recv_buf, recv_count, datatype);
             MPIR_ERR_CHECK(mpi_errno);
         } else {
             MPIR_Assert(n_chunks > 0);
+            MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_remain) = n_chunks;
             /* There is no data in the init chunk, free the buffer */
             MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, host_buf);
-            MPIR_cc_dec(rreq->cc_ptr);
             /* Post recv for the remaining chunks. */
             for (int i = 0; i < n_chunks; i++) {
                 mpi_errno = start_recv_chunk(rreq, i, n_chunks);
@@ -401,6 +403,7 @@ int MPIDI_OFI_gpu_pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Reques
         }
     } else {
         MPIR_Assert(event_id == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE);
+        MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_inrecv) -= 1;
         mpi_errno = start_recv_copy(rreq, host_buf, wc->len, recv_buf, recv_count, datatype);
         MPIR_ERR_CHECK(mpi_errno);
     }
@@ -478,9 +481,8 @@ static int recv_copy_poll(MPIR_Async_thing * thing)
 static void recv_copy_complete(MPIR_Request * rreq, void *buf)
 {
     int mpi_errno = MPI_SUCCESS;
-    int c;
-    MPIR_cc_decr(rreq->cc_ptr, &c);
-    if (c == 0) {
+    MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_remain) -= 1;
+    if (MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_remain) == 0) {
         /* all chunks arrived and copied */
         if (unlikely(MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.is_sync))) {
             MPIR_Comm *comm = rreq->comm;
@@ -511,7 +513,7 @@ static void recv_copy_complete(MPIR_Request * rreq, void *buf)
         /* Set number of bytes in status. */
         MPIR_STATUS_SET_COUNT(rreq->status, MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.offset));
 
-        MPIR_Request_free(rreq);
+        MPIDI_Request_complete_fast(rreq);
     }
 
     /* Free host buffer, yaksa request and task. */
diff --git a/src/mpid/ch4/netmod/ofi/ofi_pre.h b/src/mpid/ch4/netmod/ofi/ofi_pre.h
index bfc9ea5b53c..07b999ca808 100644
--- a/src/mpid/ch4/netmod/ofi/ofi_pre.h
+++ b/src/mpid/ch4/netmod/ofi/ofi_pre.h
@@ -223,6 +223,7 @@ typedef struct {
             fi_addr_t remote_addr;
             uint64_t cq_data;
             uint64_t match_bits;
+            int num_remain;
         } send;
         struct {
             int vci_local;
@@ -231,6 +232,8 @@ typedef struct {
             uint64_t match_bits;
             uint64_t mask_bits;
             MPI_Aint offset;
+            int num_inrecv;
+            int num_remain;
             bool is_sync;
         } recv;
     } pipeline_info;            /* GPU pipeline */