Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 87 additions & 99 deletions tests/gds_kernel_latency.c
Original file line number Diff line number Diff line change
Expand Up @@ -138,27 +138,28 @@ struct pingpong_context {
int rxtot_size;
size_t alloc_size;
char *rx_flag;
int buf_size;
int buf_sizeexp;
int calc_size;
int rx_depth;
int pending;
struct ibv_port_attr portinfo;
int gpu_id;
int kernel_duration;
int peersync;
int peersync_gpu_cq;
int consume_rx_cqe;
int gpumem;
int use_desc_apis;
int n_tx_ev;
int n_rx_ev;
int scnt;
int rcnt;
int skip_kernel_launch;
int exp_send_info;
int validate;
char *validate_buf;
int buf_size;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whitespace change: can you avoid changing the alignment of the whole struct?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am still seeing a lot of whitespace noise, which makes it hard to understand which fields are new.

int size_align;
int buf_sizeexp;
int calc_size;
int rx_depth;
int pending;
struct ibv_port_attr portinfo;
int gpu_id;
int kernel_duration;
int peersync;
int peersync_gpu_cq;
int consume_rx_cqe;
int gpumem;
int use_desc_apis;
int n_tx_ev;
int n_rx_ev;
int scnt;
int rcnt;
int skip_kernel_launch;
int exp_send_info;
int validate;
char *validate_buf;
};

static int my_rank, comm_size;
Expand Down Expand Up @@ -192,6 +193,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
int validate)
{
struct pingpong_context *ctx;
int i = 0;

if (gpu_id >=0 && gpu_init(gpu_id, sched_mode)) {
fprintf(stderr, "error in GPU init.\n");
Expand All @@ -203,6 +205,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
return NULL;

ctx->buf_size = size;
ctx->size_align = align_to(ctx->buf_size + 40, page_size);
ctx->buf_sizeexp = size/2;
ctx->calc_size = calc_size;
ctx->rx_depth = rx_depth;
Expand All @@ -218,7 +221,7 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
ctx->txbufexp_lkey = NULL;
ctx->txbufexp_addr = NULL;

size_t alloc_size = max_batch_len * align_to(size + 40, page_size);
size_t alloc_size = max_batch_len * ctx->size_align;
ctx->rxtot_size = alloc_size;
ctx->txtot_size = ctx->rxtot_size;

Expand All @@ -228,28 +231,29 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
printf("allocated GPU memory at txbuf=%p (size %d), rxbuf=%p (size=%d)\n", ctx->txbuf, ctx->txtot_size, ctx->rxbuf, ctx->rxtot_size);
if(ctx->exp_send_info == 1)
{
ctx->txbufexp = gpu_malloc(page_size, ctx->txtot_size);
ctx->txbufexp_size = (uint32_t*)gpu_malloc(page_size, sizeof(uint32_t)*max_batch_len);
ctx->txbufexp_lkey = (uint32_t*)gpu_malloc(page_size, sizeof(uint32_t)*max_batch_len);
ctx->txbufexp_addr = (uintptr_t*)gpu_malloc(page_size, sizeof(uintptr_t)*max_batch_len);
ctx->txbufexp = gpu_malloc(page_size, ctx->txtot_size);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for the UD requirement, don't you need +40 even here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose that ctx->txtot_size now includes the additional 40B, right?

ctx->txbufexp_size = (uint32_t*)gpu_malloc(page_size, sizeof(uint32_t)*max_batch_len);
ctx->txbufexp_lkey = (uint32_t*)gpu_malloc(page_size, sizeof(uint32_t)*max_batch_len);
ctx->txbufexp_addr = (uintptr_t*)gpu_malloc(page_size, sizeof(uintptr_t)*max_batch_len);
}
} else {
ctx->txbuf = memalign(page_size, ctx->txtot_size); //posix_memalign
ctx->txbuf = memalign(page_size, ctx->txtot_size);
ctx->rxbuf = memalign(page_size, ctx->rxtot_size);
printf("allocated CPU memory at txbuf=%p, rxbuf=%p\n", ctx->txbuf, ctx->rxbuf);
if(ctx->exp_send_info == 1)
{
ctx->txbufexp = memalign(page_size, ctx->txtot_size);
ctx->txbufexp_size = (uint32_t*)memalign(page_size, sizeof(uint32_t)*max_batch_len);
ctx->txbufexp_lkey = (uint32_t*)memalign(page_size, sizeof(uint32_t)*max_batch_len);
ctx->txbufexp_addr = (uintptr_t*)memalign(page_size, sizeof(uintptr_t)*max_batch_len);
ctx->txbufexp = memalign(page_size, ctx->txtot_size);
ctx->txbufexp_size = memalign(page_size, sizeof(uint32_t)*max_batch_len);
ctx->txbufexp_lkey = memalign(page_size, sizeof(uint32_t)*max_batch_len);
ctx->txbufexp_addr = memalign(page_size, sizeof(uintptr_t)*max_batch_len);
}
}

if (!ctx->txbuf || !ctx->rxbuf) {
fprintf(stderr, "Couldn't allocate work buf.\n");
goto clean_ctx;
}

if(ctx->exp_send_info == 1)
{
if(!ctx->txbufexp)
Expand Down Expand Up @@ -286,17 +290,11 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
}
}

//gpu_info("allocated ctx buffer %p\n", ctx->buf);
// ctx->rxbuf = (char*)ctx->buf;
// ctx->txbuf = (char*)ctx->buf + align_to(size + 40, page_size);

gpu_info("txbuf address 0x%lx\n", ctx->txbuf);
if(ctx->exp_send_info == 1)
gpu_info("txbufexp address 0x%lx\n", ctx->txbufexp);

//ctx->rx_flag = (char*)ctx->buf + 2 * align_to(size + 40, page_size);

ctx->rx_flag = memalign(page_size, alloc_size);
ctx->rx_flag = memalign(page_size, alloc_size);
if (!ctx->rx_flag) {
gpu_err("Couldn't allocate rx_flag buf\n");
goto clean_ctx;
Expand All @@ -312,13 +310,11 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
{
gpu_memset(ctx->txbuf, 0, ctx->txtot_size);
gpu_memset(ctx->rxbuf, 0, ctx->rxtot_size);
//gpu_memset(ctx->buf, 0, alloc_size);
}
else
{
memset(ctx->txbuf, 0, ctx->txtot_size);
memset(ctx->rxbuf, 0, ctx->rxtot_size);
//memset(ctx->buf, 0, alloc_size);
}

memset(ctx->rx_flag, 0, alloc_size);
Expand Down Expand Up @@ -372,34 +368,34 @@ static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
goto clean_pd;
}

if (ctx->gpumem) {
gpu_memset32(ctx->txbufexp_size, ctx->buf_sizeexp, 1);
gpu_memset32(ctx->txbufexp_lkey, ctx->mrexp->lkey, 1);

uint32_t tmp_addr[2];
((uintptr_t*)tmp_addr)[0] = (uintptr_t)(ctx->txbufexp);
CUDACHECK(cudaMemcpy(
(uint32_t*)ctx->txbufexp_addr,
(uint32_t*)tmp_addr,
2*sizeof(uint32_t),
cudaMemcpyDefault
));

gpu_memset(ctx->txbufexp, 0, ctx->txtot_size);
}
else {
ctx->txbufexp_size[0] = ctx->buf_sizeexp;
ctx->txbufexp_lkey[0] = ctx->mrexp->lkey;
ctx->txbufexp_addr[0]=(uintptr_t)(ctx->txbufexp);
gpu_info("exp_send_info - hostmem: new tx size: %d instead of %d. New tx addr: %lx instead of %lx\n",
ctx->txbufexp_size[0], ctx->buf_size, ctx->txbufexp_addr[0], ctx->txbuf);

memset(ctx->txbufexp, 0, ctx->txtot_size);

for(i=0; i < max_batch_len; i++)
{
if (ctx->gpumem) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find it hard to follow the logic here.
Could you explain why this for loop contains that `if (ctx->gpumem)` check?

gpu_memset32(&(ctx->txbufexp_size[i]), ctx->buf_sizeexp, 1);
gpu_memset32(&(ctx->txbufexp_lkey[i]), ctx->mrexp->lkey, 1);

uint32_t tmp_addr[2];
((uintptr_t*)tmp_addr)[0] = (uintptr_t)(ctx->txbufexp+(i*ctx->size_align));
CUDACHECK(cudaMemcpy(
(uint32_t*)&(ctx->txbufexp_addr[i]),
(uint32_t*)tmp_addr,
2*sizeof(uint32_t),
cudaMemcpyDefault
));

gpu_memset(ctx->txbufexp, 0, ctx->txtot_size);
}
else {
ctx->txbufexp_size[i] = ctx->buf_sizeexp;
ctx->txbufexp_lkey[i] = ctx->mrexp->lkey;
ctx->txbufexp_addr[i]=(uintptr_t)(ctx->txbufexp+(i*ctx->size_align));
gpu_info("exp_send_info - hi=%d, ostmem: new tx size: %d instead of %d. New tx addr: %lx instead of %lx\n",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"ostmem" probably missing an 'h'

ctx->txbufexp_size[i], ctx->buf_size, ctx->txbufexp_addr[i], ctx->txbuf);

memset(ctx->txbufexp, 0, ctx->txtot_size);
}
}
}



int gds_flags = 0;
if (peersync_gpu_cq)
Expand Down Expand Up @@ -522,6 +518,7 @@ int pp_close_ctx(struct pingpong_context *ctx)
}

if (IBV_QPT_UD == gds_qpt) {

if (ibv_destroy_ah(ctx->ah)) {
gpu_err("Couldn't destroy AH\n");
}
Expand All @@ -532,6 +529,7 @@ int pp_close_ctx(struct pingpong_context *ctx)
}

if (ctx->channel) {

if (ibv_destroy_comp_channel(ctx->channel)) {
gpu_err("Couldn't destroy completion channel\n");
}
Expand All @@ -544,7 +542,6 @@ int pp_close_ctx(struct pingpong_context *ctx)
if(ctx->validate)
free(ctx->validate_buf);


if (ctx->gpumem)
{
if( ctx->exp_send_info == 1 )
Expand Down Expand Up @@ -657,7 +654,7 @@ static int pp_post_recv(struct pingpong_context *ctx, int n)
for (i = 0; i < n; ++i)
{
struct ibv_sge list = {
.addr = (uintptr_t) (ctx->rxbuf+(i*align_to(ctx->buf_size + 40, page_size))),
.addr = (uintptr_t) (ctx->rxbuf+(i*ctx->size_align)),
.length = ctx->buf_size + 40, // good for IBV_QPT_UD
.lkey = ctx->rxmr->lkey
};
Expand Down Expand Up @@ -721,7 +718,7 @@ static int pp_post_gpu_send(struct pingpong_context *ctx, int iteration, uint32_
{
int ret = 0;
struct ibv_sge list = {
.addr = (uintptr_t) (ctx->txbuf+(iteration*align_to(ctx->buf_size + 40, page_size))),
.addr = (uintptr_t) (ctx->txbuf+(iteration*ctx->size_align)),
.length = ctx->buf_size,
.lkey = ctx->txmr->lkey
};
Expand Down Expand Up @@ -759,7 +756,7 @@ static int pp_prepare_gpu_send(struct pingpong_context *ctx, int iteration, uint
{
int ret = 0;
struct ibv_sge list = {
.addr = (uintptr_t) (ctx->txbuf+(iteration*align_to(ctx->buf_size + 40, page_size))),
.addr = (uintptr_t) (ctx->txbuf+(iteration*ctx->size_align)),
.length = ctx->buf_size,
.lkey = ctx->txmr->lkey
};
Expand All @@ -778,7 +775,7 @@ static int pp_prepare_gpu_send(struct pingpong_context *ctx, int iteration, uint
},
.comp_mask = 0
};

if (IBV_QPT_UD != gds_qpt) {
memset(&ewr, 0, sizeof(ewr));
ewr.num_sge = 1;
Expand All @@ -788,7 +785,7 @@ static int pp_prepare_gpu_send(struct pingpong_context *ctx, int iteration, uint
ewr.sg_list = &list;
ewr.next = NULL;
}

if( ctx->exp_send_info == 1 )
ewr.exp_send_flags |= IBV_EXP_SEND_GET_INFO;

Expand Down Expand Up @@ -833,8 +830,6 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
int i, ret = 0;
int posted_recv = 0;

//printf("post_work posting %d\n", n_posts);

if (n_posts <= 0)
return 0;

Expand All @@ -853,27 +848,27 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin

if(ctx->validate)
{
//Useless?
cudaDeviceSynchronize();
if (ctx->gpumem)
gpu_memset(ctx->rxbuf, 0, ctx->rxtot_size);
else
memset(ctx->rxbuf, 0, ctx->rxtot_size);

//Useless?
cudaDeviceSynchronize();
for (i = 0; i < posted_recv; ++i)
{
if (ctx->gpumem)
{
gpu_memset(ctx->txbuf+(i*align_to(ctx->buf_size + 40, page_size)), i%CHAR_MAX, ctx->buf_size);
gpu_memset(ctx->txbuf+(i*ctx->size_align), i%CHAR_MAX, ctx->buf_size);
//We need to cover the entire buffer
if(ctx->exp_send_info)
gpu_memset(ctx->txbufexp+(i*align_to(ctx->buf_size + 40, page_size)), (i+1)%CHAR_MAX, ctx->buf_size);
gpu_memset(ctx->txbufexp+(i*ctx->size_align), (i+1)%CHAR_MAX, ctx->buf_size);
}
else
{
memset(ctx->txbuf+(i*align_to(ctx->buf_size + 40, page_size)), i%CHAR_MAX, ctx->buf_size);
memset(ctx->txbuf+(i*ctx->size_align), i%CHAR_MAX, ctx->buf_size);
if(ctx->exp_send_info)
memset(ctx->txbufexp+(i*align_to(ctx->buf_size + 40, page_size)), (i+1)%CHAR_MAX, ctx->buf_size);
memset(ctx->txbufexp+(i*ctx->size_align), (i+1)%CHAR_MAX, ctx->buf_size);
}
}
}
Expand Down Expand Up @@ -903,9 +898,9 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
{
ret = gds_prepare_send_info(
&wdesc->send_rq,
&(ctx->txbufexp_size[0]), (ctx->gpumem == 1) ? GDS_MEMORY_GPU : GDS_MEMORY_HOST,
&(ctx->txbufexp_lkey[0]), (ctx->gpumem == 1) ? GDS_MEMORY_GPU : GDS_MEMORY_HOST,
&(ctx->txbufexp_addr[0]), (ctx->gpumem == 1) ? GDS_MEMORY_GPU : GDS_MEMORY_HOST);
&(ctx->txbufexp_size[i]), (ctx->gpumem == 1) ? GDS_MEMORY_GPU : GDS_MEMORY_HOST,
&(ctx->txbufexp_lkey[i]), (ctx->gpumem == 1) ? GDS_MEMORY_GPU : GDS_MEMORY_HOST,
&(ctx->txbufexp_addr[i]), (ctx->gpumem == 1) ? GDS_MEMORY_GPU : GDS_MEMORY_HOST);

if (ret) {
retcode = -ret;
Expand Down Expand Up @@ -1059,9 +1054,9 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
{
ret = gds_prepare_send_info(
&wdesc->send_rq,
&(ctx->txbufexp_size[0]), (ctx->gpumem == 1) ? GDS_MEMORY_GPU : GDS_MEMORY_HOST,
&(ctx->txbufexp_lkey[0]), (ctx->gpumem == 1) ? GDS_MEMORY_GPU : GDS_MEMORY_HOST,
&(ctx->txbufexp_addr[0]), (ctx->gpumem == 1) ? GDS_MEMORY_GPU : GDS_MEMORY_HOST);
&(ctx->txbufexp_size[i]), (ctx->gpumem == 1) ? GDS_MEMORY_GPU : GDS_MEMORY_HOST,
&(ctx->txbufexp_lkey[i]), (ctx->gpumem == 1) ? GDS_MEMORY_GPU : GDS_MEMORY_HOST,
&(ctx->txbufexp_addr[i]), (ctx->gpumem == 1) ? GDS_MEMORY_GPU : GDS_MEMORY_HOST);

if (ret) {
retcode = -ret;
Expand Down Expand Up @@ -1135,7 +1130,7 @@ static int pp_post_work(struct pingpong_context *ctx, int n_posts, int rcnt, uin
//Useless??
MPI_Barrier(MPI_COMM_WORLD);
cudaDeviceSynchronize();
cudaMemcpy(ctx->validate_buf, ctx->rxbuf+(i*align_to(ctx->buf_size + 40, page_size)), ctx->buf_size, cudaMemcpyDefault);
cudaMemcpy(ctx->validate_buf, ctx->rxbuf+(i*ctx->size_align), ctx->buf_size, cudaMemcpyDefault);
char *value = (char*)ctx->validate_buf;
char expected=i%CHAR_MAX;

Expand Down Expand Up @@ -1198,8 +1193,6 @@ static void usage(const char *argv0)
printf(" -E, --gpu-mem allocate GPU intead of CPU memory buffers\n");
printf(" -K, --skip-kernel-launch no GPU kernel computations, only communications\n");
printf(" -I, --send-info modify send info after CPU posting\n");


}

int main(int argc, char *argv[])
Expand Down Expand Up @@ -1540,11 +1533,11 @@ int main(int argc, char *argv[])
if (!ctx)
return 1;

int nrecv = pp_post_recv(ctx, max_batch_len);
if (nrecv < max_batch_len) {
gpu_warn("[%d] Could not post all receive, requested %d, actually posted %d\n", my_rank, max_batch_len, nrecv);
return 1;
}
// int nrecv = pp_post_recv(ctx, max_batch_len);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where is pp_post_recv() being called now?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inside pp_post_work, before starting the main loop `for (i = 0; i < posted_recv; ++i)`. My initial question was: why is there this additional pp_post_recv call outside of, and before, pp_post_work?

// if (nrecv < max_batch_len) {
// gpu_warn("[%d] Could not post all receive, requested %d, actually posted %d\n", my_rank, max_batch_len, nrecv);
// return 1;
// }

if (pp_get_port_info(ctx->context, ib_port, &ctx->portinfo)) {
gpu_err("[%d] Couldn't get port info\n", my_rank);
Expand Down Expand Up @@ -1684,11 +1677,6 @@ int main(int argc, char *argv[])
}
}

if( ctx->exp_send_info == 1 )
{

}

float pre_post_us = 0;

{
Expand Down