Skip to content

Commit cb95d73

Browse files
committed
Add support for Host-backed GPU maps
- Introduced two new map types: BPF_MAP_TYPE_PERGPUTD_ARRAY_HOST_MAP and BPF_MAP_TYPE_GPU_ARRAY_HOST_MAP for Tegra platforms without CUDA IPC. - Updated default_trampoline.cu to handle new host-based map types in the BPF helper functions. - Created host_map_test.bpf.c and host_map_test.c to demonstrate usage of the new host-backed maps, including per-thread and shared storage. - Enhanced the build system with a Makefile and README for the new example, detailing usage and requirements. This change improves memory management and flexibility for applications running on platforms lacking CUDA IPC support, enabling efficient data sharing between CPU and GPU. Test Method and Cases Prerequisites Test Case 1: Basic Functionality Purpose: Verify both map types work correctly Expected Result: shared_counter shows values for keys 0-9 (threads mod 10) perthread_counter shows per-thread call counts, execution times, and thread IDs thread_timestamp shows active thread count Signed-off-by: jingxuanxie <[email protected]>
1 parent f9a1ebd commit cb95d73

16 files changed

+1396
-7
lines changed

attach/nv_attach_impl/trampoline/default_trampoline.cu

Lines changed: 39 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -83,11 +83,16 @@ struct CommSharedMem {
8383
uint64_t time_sum[8];
8484
};
8585

86+
// IPC-based GPU maps (for x86 with CUDA IPC support)
8687
const int BPF_MAP_TYPE_PERGPUTD_ARRAY_MAP = 1502;
8788
const int BPF_MAP_TYPE_GPU_ARRAY_MAP = 1503; // non-per-thread, single-copy
8889
// shared array
8990
const int BPF_MAP_TYPE_GPU_RINGBUF_MAP = 1527;
9091

92+
// HOST-based GPU maps (for Tegra/platforms without CUDA IPC)
93+
const int BPF_MAP_TYPE_PERGPUTD_ARRAY_HOST_MAP = 1512;
94+
const int BPF_MAP_TYPE_GPU_ARRAY_HOST_MAP = 1513;
95+
9196
struct MapBasicInfo {
9297
bool enabled;
9398
int key_size;
@@ -198,6 +203,7 @@ extern "C" __noinline__ __device__ uint64_t _bpf_helper_ext_0001(
198203
auto &req = global_data->req;
199204
// CallRequest req;
200205
const auto &map_info = ::map_info[map];
206+
// IPC-based per-thread array map
201207
if (map_info.map_type == BPF_MAP_TYPE_PERGPUTD_ARRAY_MAP) {
202208
auto real_key = *(uint32_t *)(uintptr_t)key;
203209
auto offset = array_map_offset(real_key, map_info, map);
@@ -212,6 +218,20 @@ extern "C" __noinline__ __device__ uint64_t _bpf_helper_ext_0001(
212218
(uint64_t)real_key *
213219
map_info.value_size);
214220
}
221+
// HOST-based per-thread array map (for Tegra)
222+
if (map_info.map_type == BPF_MAP_TYPE_PERGPUTD_ARRAY_HOST_MAP) {
223+
auto real_key = *(uint32_t *)(uintptr_t)key;
224+
auto offset = array_map_offset(real_key, map_info, map);
225+
asm("membar.sys;"); // Ensure CPU writes are visible to GPU
226+
return (uint64_t)offset;
227+
}
228+
// HOST-based non-per-thread GPU array map (for Tegra)
229+
if (map_info.map_type == BPF_MAP_TYPE_GPU_ARRAY_HOST_MAP) {
230+
auto real_key = *(uint32_t *)(uintptr_t)key;
231+
auto base = (char *)map_info.extra_buffer;
232+
asm("membar.sys;"); // Ensure CPU writes are visible to GPU
233+
return (uint64_t)(uintptr_t)(base + (uint64_t)real_key * map_info.value_size);
234+
}
215235
// printf("helper1 map %ld keysize=%d valuesize=%d\n", map,
216236
// map_info.key_size, map_info.value_size);
217237
simple_memcpy(&req.map_lookup.key, (void *)(uintptr_t)key,
@@ -229,6 +249,7 @@ extern "C" __noinline__ __device__ uint64_t _bpf_helper_ext_0002(
229249
CommSharedMem *global_data = (CommSharedMem *)constData;
230250
auto &req = global_data->req;
231251
const auto &map_info = ::map_info[map];
252+
// IPC-based per-thread array map
232253
if (map_info.map_type == BPF_MAP_TYPE_PERGPUTD_ARRAY_MAP) {
233254
auto real_key = *(uint32_t *)(uintptr_t)key;
234255
auto offset = array_map_offset(real_key, map_info, map);
@@ -249,6 +270,24 @@ extern "C" __noinline__ __device__ uint64_t _bpf_helper_ext_0002(
249270
asm("membar.sys; \n\t");
250271
return 0;
251272
}
273+
// HOST-based per-thread array map (for Tegra)
274+
if (map_info.map_type == BPF_MAP_TYPE_PERGPUTD_ARRAY_HOST_MAP) {
275+
auto real_key = *(uint32_t *)(uintptr_t)key;
276+
auto offset = array_map_offset(real_key, map_info, map);
277+
simple_memcpy(offset, (void *)(uintptr_t)value,
278+
map_info.value_size);
279+
asm("membar.sys;"); // Ensure GPU writes are visible to CPU
280+
return 0;
281+
}
282+
// HOST-based non-per-thread GPU array map (for Tegra)
283+
if (map_info.map_type == BPF_MAP_TYPE_GPU_ARRAY_HOST_MAP) {
284+
auto real_key = *(uint32_t *)(uintptr_t)key;
285+
auto base = (char *)map_info.extra_buffer;
286+
auto dst = (void *)(uintptr_t)(base + (uint64_t)real_key * map_info.value_size);
287+
simple_memcpy(dst, (void *)(uintptr_t)value, map_info.value_size);
288+
asm("membar.sys;"); // Ensure GPU writes are visible to CPU
289+
return 0;
290+
}
252291
// printf("helper2 map %ld keysize=%d
253292
// valuesize=%d\n",map,map_info.key_size,map_info.value_size);
254293
simple_memcpy(&req.map_update.key, (void *)(uintptr_t)key,

example/gpu/host_map_test/Makefile

Lines changed: 121 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,121 @@
1+
# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
2+
OUTPUT := .output
3+
CLANG ?= clang
4+
LIBBPF_SRC := $(abspath ../../../third_party/libbpf/src)
5+
BPFTOOL_SRC := $(abspath ../../../third_party/bpftool/src)
6+
LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
7+
BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool)
8+
BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool
9+
ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
10+
| sed 's/arm.*/arm/' \
11+
| sed 's/aarch64/arm64/' \
12+
| sed 's/ppc64le/powerpc/' \
13+
| sed 's/mips.*/mips/' \
14+
| sed 's/riscv64/riscv/' \
15+
| sed 's/loongarch64/loongarch/')
16+
VMLINUX := ../../../third_party/vmlinux/$(ARCH)/vmlinux.h
17+
# Use our own libbpf API headers and Linux UAPI headers distributed with
18+
# libbpf to avoid dependency on system-wide headers, which could be missing or
19+
# outdated
20+
INCLUDES := -I$(OUTPUT) -I../../../third_party/libbpf/include/uapi -I$(dir $(VMLINUX))
21+
CFLAGS := -g -Wall
22+
ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)
23+
24+
APPS = host_map_test
25+
26+
CARGO ?= $(shell which cargo)
27+
ifeq ($(strip $(CARGO)),)
28+
BZS_APPS :=
29+
else
30+
BZS_APPS :=
31+
APPS += $(BZS_APPS)
32+
ALL_LDFLAGS += -lrt -ldl -lpthread -lm
33+
endif
34+
35+
CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
36+
| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
37+
38+
ifeq ($(V),1)
39+
Q =
40+
msg =
41+
else
42+
Q = @
43+
msg = @printf ' %-8s %s%s\n' \
44+
"$(1)" \
45+
"$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
46+
"$(if $(3), $(3))";
47+
MAKEFLAGS += --no-print-directory
48+
endif
49+
50+
define allow-override
51+
$(if $(or $(findstring environment,$(origin $(1))),\
52+
$(findstring command line,$(origin $(1)))),,\
53+
$(eval $(1) = $(2)))
54+
endef
55+
56+
$(call allow-override,CC,$(CROSS_COMPILE)cc)
57+
$(call allow-override,LD,$(CROSS_COMPILE)ld)
58+
59+
.PHONY: all
60+
all: $(APPS) vec_add
61+
62+
vec_add: vec_add.cu
63+
@if command -v nvcc >/dev/null 2>&1; then \
64+
echo " NVCC vec_add"; \
65+
nvcc -cudart shared vec_add.cu -o vec_add -g; \
66+
else \
67+
echo "Warning: CUDA not found, skipping vec_add build"; \
68+
fi
69+
70+
.PHONY: clean
71+
clean:
72+
$(call msg,CLEAN)
73+
$(Q)rm -rf $(OUTPUT) $(APPS) vec_add
74+
75+
$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
76+
$(call msg,MKDIR,$@)
77+
$(Q)mkdir -p $@
78+
79+
# Build libbpf
80+
$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
81+
$(call msg,LIB,$@)
82+
$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \
83+
OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \
84+
INCLUDEDIR= LIBDIR= UAPIDIR= \
85+
install
86+
87+
# Build bpftool
88+
$(BPFTOOL): | $(BPFTOOL_OUTPUT)
89+
$(call msg,BPFTOOL,$@)
90+
$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
91+
92+
# Build BPF code
93+
$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
94+
$(call msg,BPF,$@)
95+
$(Q)$(CLANG) -Xlinker --export-dynamic -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \
96+
$(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
97+
-c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
98+
$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
99+
100+
# Generate BPF skeletons
101+
$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
102+
$(call msg,GEN-SKEL,$@)
103+
$(Q)$(BPFTOOL) gen skeleton $< > $@
104+
105+
# Build user-space code
106+
$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h
107+
108+
$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
109+
$(call msg,CC,$@)
110+
$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
111+
112+
# Build application binary
113+
$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
114+
$(call msg,BINARY,$@)
115+
$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
116+
117+
# delete failed targets
118+
.DELETE_ON_ERROR:
119+
120+
# keep intermediate (.skel.h, .bpf.o, etc) targets
121+
.SECONDARY:

0 commit comments

Comments (0)