Skip to content

Commit

Permalink
refs #6: Try physically contiguous memory as GPU IO buffer, but failed.
Browse files Browse the repository at this point in the history
 * Uses DPDK's memzone instead of CUDA's portable buffers.

 * Just keep it as an optional feature.
   Default is not to use it.
  • Loading branch information
achimnol committed Jan 14, 2016
1 parent 6612281 commit d81c5f9
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 22 deletions.
5 changes: 5 additions & 0 deletions include/nba/engines/cuda/computecontext.hh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
#include <nba/engines/cuda/mempool.hh>
#include <nba/engines/cuda/utils.hh>

struct rte_memzone;

#define CUDA_MAX_KERNEL_ARGS (16)

namespace nba
Expand Down Expand Up @@ -92,6 +94,9 @@ private:
CPUMemoryPool _cpu_mempool_in[NBA_MAX_IO_BASES];
CPUMemoryPool _cpu_mempool_out[NBA_MAX_IO_BASES];

const struct rte_memzone *reserve_memory(ComputeDevice *mother);
const struct rte_memzone *mz;

void *dummy_host_buf;
memory_t dummy_dev_buf;

Expand Down
16 changes: 11 additions & 5 deletions include/nba/engines/cuda/mempool.hh
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ private:
class CPUMemoryPool : public MemoryPool
{
public:
CPUMemoryPool(int cuda_flags = 0) : MemoryPool(), base(NULL), flags(cuda_flags)
CPUMemoryPool(int cuda_flags = 0) : MemoryPool(), base(NULL), flags(cuda_flags), use_external(false)
{
}

Expand All @@ -72,11 +72,16 @@ public:
return true;
}

bool init_with_flags(unsigned long size, int flags)
bool init_with_flags(unsigned long size, void *ext_ptr, int flags)
{
this->max_size = size;
cutilSafeCall(cudaHostAlloc((void **) &base, size,
flags));
if (ext_ptr != nullptr) {
base = ext_ptr;
use_external = true;
} else {
cutilSafeCall(cudaHostAlloc((void **) &base, size,
flags));
}
return true;
}

Expand All @@ -91,7 +96,7 @@ public:

void destroy()
{
if (base != NULL)
if (base != NULL && !use_external)
cudaFreeHost(base);
}

Expand All @@ -103,6 +108,7 @@ public:
protected:
void *base;
int flags;
bool use_external;
};

}
Expand Down
2 changes: 1 addition & 1 deletion include/nba/framework/computedevice.hh
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,10 @@ public:
struct ev_async *input_watcher;
AsyncSemaphore available_sema;

protected:
const unsigned node_id;
const unsigned device_id;
const size_t num_contexts;
protected:
std::vector<ComputeContext *> contexts;
Lock _lock;

Expand Down
47 changes: 39 additions & 8 deletions src/engines/cuda/computecontext.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include <nba/core/intrinsic.hh>
#include <nba/engines/cuda/computecontext.hh>
#include <rte_memzone.h>
#include <unistd.h>

using namespace std;
using namespace nba;
Expand All @@ -10,27 +12,38 @@ struct cuda_event_context {
void *user_arg;
};

CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother_device)
: ComputeContext(ctx_id, mother_device), checkbits_d(NULL), checkbits_h(NULL),
num_kernel_args(0)
#define IO_BASE_SIZE (4 * 1024 * 1024)
#undef USE_PHYS_CONT_MEMORY // performance degraded :(

CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother)
: ComputeContext(ctx_id, mother), checkbits_d(NULL), checkbits_h(NULL),
mz(reserve_memory(mother)), num_kernel_args(0)
/* NOTE: Write-combined memory degrades performance to half... */
{
type_name = "cuda";
size_t io_base_size = 4 * 1024 * 1024; // TODO: read from config
size_t io_base_size = ALIGN_CEIL(IO_BASE_SIZE, getpagesize()); // TODO: read from config
cutilSafeCall(cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking));
io_base_ring.init(NBA_MAX_IO_BASES, node_id, io_base_ring_buf);
for (unsigned i = 0; i < NBA_MAX_IO_BASES; i++) {
io_base_ring.push_back(i);
_cuda_mempool_in[i].init(io_base_size);
_cuda_mempool_out[i].init(io_base_size);
_cpu_mempool_in[i].init_with_flags(io_base_size, cudaHostAllocPortable);
_cpu_mempool_out[i].init_with_flags(io_base_size, cudaHostAllocPortable);
#ifdef USE_PHYS_CONT_MEMORY
void *base;
base = (void *) ((uintptr_t) mz->addr + i * io_base_size);
_cpu_mempool_in[i].init_with_flags(io_base_size, base, 0);
base = (void *) ((uintptr_t) mz->addr + i * io_base_size + NBA_MAX_IO_BASES * io_base_size);
_cpu_mempool_out[i].init_with_flags(io_base_size, base, 0);
#else
_cpu_mempool_in[i].init_with_flags(io_base_size, nullptr, cudaHostAllocPortable);
_cpu_mempool_out[i].init_with_flags(io_base_size, nullptr, cudaHostAllocPortable);
#endif
}
{
void *t;
cutilSafeCall(cudaMalloc((void **) &t, 64));
cutilSafeCall(cudaMalloc((void **) &t, CACHE_LINE_SIZE));
dummy_dev_buf.ptr = t;
cutilSafeCall(cudaHostAlloc((void **) &t, 64, cudaHostAllocPortable));
cutilSafeCall(cudaHostAlloc((void **) &t, CACHE_LINE_SIZE, cudaHostAllocPortable));
dummy_host_buf = t;
}
cutilSafeCall(cudaHostAlloc((void **) &checkbits_h, MAX_BLOCKS, cudaHostAllocMapped));
Expand All @@ -40,6 +53,22 @@ CUDAComputeContext::CUDAComputeContext(unsigned ctx_id, ComputeDevice *mother_de
memset(checkbits_h, 0, MAX_BLOCKS);
}

const struct rte_memzone *CUDAComputeContext::reserve_memory(ComputeDevice *mother)
{
#ifdef USE_PHYS_CONT_MEMORY
    /* Reserve a single physically contiguous DPDK memzone on this device's
     * NUMA node, sized to hold both the input and output host-side IO
     * buffers for every IO base (2 * io_base_size * NBA_MAX_IO_BASES).
     * The constructor later carves per-base sub-buffers out of mz->addr.
     * RTE_MEMZONE_SIZE_HINT_ONLY makes the 2MB-hugepage request a hint, so
     * DPDK may fall back to other page sizes instead of failing outright. */
    char namebuf[RTE_MEMZONE_NAMESIZE];
    size_t io_base_size = ALIGN_CEIL(IO_BASE_SIZE, getpagesize());
    /* device_id and ctx_id are unsigned; use %u (was %d, which is undefined
     * behavior for unsigned arguments per the C printf specification). */
    snprintf(namebuf, RTE_MEMZONE_NAMESIZE, "cuda.io.%u:%u", mother->device_id, ctx_id);
    const struct rte_memzone *_mz = rte_memzone_reserve(namebuf, 2 * io_base_size * NBA_MAX_IO_BASES,
                                                        mother->node_id,
                                                        RTE_MEMZONE_2MB | RTE_MEMZONE_SIZE_HINT_ONLY);
    /* NOTE(review): rte_memzone_reserve() returns NULL on failure, and this
     * assert compiles away under NDEBUG — a failed reservation would then go
     * unnoticed until the constructor dereferences mz->addr. Consider an
     * explicit check with rte_panic()/abort() for release builds. */
    assert(_mz != nullptr);
    return _mz;
#else
    /* Physically contiguous buffers are disabled (USE_PHYS_CONT_MEMORY is
     * #undef'd in this file); callers must tolerate a NULL memzone. */
    return nullptr;
#endif
}

CUDAComputeContext::~CUDAComputeContext()
{
cutilSafeCall(cudaStreamDestroy(_stream));
Expand All @@ -49,6 +78,8 @@ CUDAComputeContext::~CUDAComputeContext()
_cpu_mempool_in[i].destroy();
_cpu_mempool_out[i].destroy();
}
if (mz != nullptr)
rte_memzone_free(mz);
cutilSafeCall(cudaFreeHost(checkbits_h));
}

Expand Down
16 changes: 8 additions & 8 deletions src/lib/io.cc
Original file line number Diff line number Diff line change
Expand Up @@ -813,9 +813,9 @@ int io_loop(void *arg)
/* The IO thread runs in polling mode. */
while (likely(!ctx->loop_broken)) {
unsigned total_recv_cnt = 0;
#ifdef NBA_CPU_MICROBENCH
#ifdef NBA_CPU_MICROBENCH/*{{{*/
PAPI_start(ctx->papi_evset_rx);
#endif
#endif/*}}}*/
for (i = 0; i < ctx->num_hw_rx_queues; i++) {
#ifdef NBA_RANDOM_PORT_ACCESS /*{{{*/
/* Shuffle the RX queue list. */
Expand Down Expand Up @@ -939,14 +939,14 @@ int io_loop(void *arg)

} // end of rxq scanning
assert(total_recv_cnt <= NBA_MAX_IO_BATCH_SIZE * ctx->num_hw_rx_queues);
#ifdef NBA_CPU_MICROBENCH
#ifdef NBA_CPU_MICROBENCH/*{{{*/
{
long long ctr[5];
PAPI_stop(ctx->papi_evset_rx, ctr);
for (int i = 0; i < 5; i++)
ctx->papi_ctr_rx[i] += ctr[i];
}
#endif
#endif/*}}}*/

if (ctx->mode == IO_EMUL) {/*{{{*/
while (!rte_ring_empty(ctx->drop_queue)) {
Expand All @@ -970,14 +970,14 @@ int io_loop(void *arg)
/* Scan and execute schedulable elements. */
ctx->comp_ctx->elem_graph->scan_schedulable_elements(loop_count);

#ifdef NBA_CPU_MICROBENCH
#ifdef NBA_CPU_MICROBENCH/*{{{*/
{
long long ctr[5];
PAPI_stop(ctx->papi_evset_comp, ctr);
for (int i = 0; i < 5; i++)
ctx->papi_ctr_comp[i] += ctr[i];
}
#endif
#endif/*}}}*/

while (!rte_ring_empty(ctx->new_packet_request_ring))/*{{{*/
{
Expand Down Expand Up @@ -1013,9 +1013,9 @@ int io_loop(void *arg)

/* Process received packets. */
print_ratelimit("# received pkts from all rxq", total_recv_cnt, 10000);
#ifdef NBA_CPU_MICROBENCH
#ifdef NBA_CPU_MICROBENCH/*{{{*/
PAPI_start(ctx->papi_evset_comp);
#endif
#endif/*}}}*/
unsigned comp_batch_size = ctx->comp_ctx->num_combatch_size;
for (unsigned pidx = 0; pidx < total_recv_cnt; pidx += comp_batch_size) {
comp_process_batch(ctx, &pkts[pidx], RTE_MIN(comp_batch_size, total_recv_cnt - pidx), loop_count);
Expand Down

0 comments on commit d81c5f9

Please sign in to comment.