diff --git a/example/gpu/perthreadRuntimeDist/.gitignore b/example/gpu/perthreadRuntimeDist/.gitignore
new file mode 100644
index 000000000..dda11ef6e
--- /dev/null
+++ b/example/gpu/perthreadRuntimeDist/.gitignore
@@ -0,0 +1,2 @@
+/perthreadRuntimeDist
+/.output
\ No newline at end of file
diff --git a/example/gpu/perthreadRuntimeDist/Makefile b/example/gpu/perthreadRuntimeDist/Makefile
new file mode 100644
index 000000000..9d6848501
--- /dev/null
+++ b/example/gpu/perthreadRuntimeDist/Makefile
@@ -0,0 +1,145 @@
+# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+OUTPUT := .output
+CLANG ?= clang
+LIBBPF_SRC := $(abspath ../../../third_party/libbpf/src)
+BPFTOOL_SRC := $(abspath ../../../third_party/bpftool/src)
+LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
+BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool)
+BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool
+ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
+			 | sed 's/arm.*/arm/' \
+			 | sed 's/aarch64/arm64/' \
+			 | sed 's/ppc64le/powerpc/' \
+			 | sed 's/mips.*/mips/' \
+			 | sed 's/riscv64/riscv/' \
+			 | sed 's/loongarch64/loongarch/')
+VMLINUX := ../../../third_party/vmlinux/$(ARCH)/vmlinux.h
+# Use our own libbpf API headers and Linux UAPI headers distributed with
+# libbpf to avoid dependency on system-wide headers, which could be missing or
+# outdated
+INCLUDES := -I$(OUTPUT) -I../../../third_party/libbpf/include/uapi -I$(dir $(VMLINUX))
+CFLAGS := -g -Wall
+ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)
+
+APPS = perthreadRuntimeDist # minimal minimal_legacy uprobe kprobe fentry usdt sockfilter tc ksyscall
+
+CARGO ?= $(shell which cargo)
+ifeq ($(strip $(CARGO)),)
+BZS_APPS :=
+else
+BZS_APPS := # profile
+APPS += $(BZS_APPS)
+# Required by libblazesym
+ALL_LDFLAGS += -lrt -ldl -lpthread -lm
+endif
+
+# Get Clang's default includes on this system. We'll explicitly add these dirs
+# to the includes list when compiling with `-target bpf` because otherwise some
+# architecture-specific dirs will be "missing" on some architectures/distros -
+# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h,
+# sys/cdefs.h etc. might be missing.
+#
+# Use '-idirafter': Don't interfere with include mechanics except where the
+# build would have failed anyways.
+CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
+
+ifeq ($(V),1)
+	Q =
+	msg =
+else
+	Q = @
+	msg = @printf '  %-8s %s%s\n' \
+		      "$(1)" \
+		      "$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
+		      "$(if $(3), $(3))";
+	MAKEFLAGS += --no-print-directory
+endif
+
+define allow-override
+  $(if $(or $(findstring environment,$(origin $(1))),\
+            $(findstring command line,$(origin $(1)))),,\
+    $(eval $(1) = $(2)))
+endef
+
+$(call allow-override,CC,$(CROSS_COMPILE)cc)
+$(call allow-override,LD,$(CROSS_COMPILE)ld)
+
+.PHONY: all
+all: $(APPS) vec_add
+
+vec_add: vec_add.cu
+	@if command -v nvcc >/dev/null 2>&1; then \
+		nvcc -arch=sm_61 -cudart shared vec_add.cu -o vec_add -g; \
+	else \
+		echo "Warning: CUDA not found, skipping vec_add build"; \
+	fi
+
+.PHONY: clean
+clean:
+	$(call msg,CLEAN)
+	$(Q)rm -rf $(OUTPUT) $(APPS) vec_add
+
+$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
+	$(call msg,MKDIR,$@)
+	$(Q)mkdir -p $@
+
+# Build libbpf
+$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
+	$(call msg,LIB,$@)
+	$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \
+		    OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \
+		    INCLUDEDIR= LIBDIR= UAPIDIR= \
+		    install
+
+# Build bpftool
+$(BPFTOOL): | $(BPFTOOL_OUTPUT)
+	$(call msg,BPFTOOL,$@)
+	$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
+
+
+$(LIBBLAZESYM_SRC)/target/release/libblazesym.a::
+	$(Q)cd $(LIBBLAZESYM_SRC) && $(CARGO) build --features=cheader,dont-generate-test-files --release
+
+$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
+	$(call msg,LIB, $@)
+	$(Q)cp $(LIBBLAZESYM_SRC)/target/release/libblazesym.a $@
+
+$(LIBBLAZESYM_HEADER): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
+	$(call msg,LIB,$@)
+	$(Q)cp $(LIBBLAZESYM_SRC)/target/release/blazesym.h $@
+
+# Build BPF code
+$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
+	$(call msg,BPF,$@)
+	$(Q)$(CLANG) -Xlinker --export-dynamic -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \
+		     $(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
+		     -c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
+	$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
+
+# Generate BPF skeletons
+$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
+	$(call msg,GEN-SKEL,$@)
+	$(Q)$(BPFTOOL) gen skeleton $< > $@
+
+# Build user-space code
+$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h
+
+$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
+	$(call msg,CC,$@)
+	$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
+
+$(patsubst %,$(OUTPUT)/%.o,$(BZS_APPS)): $(LIBBLAZESYM_HEADER)
+
+$(BZS_APPS): $(LIBBLAZESYM_OBJ)
+
+# Build application binary
+$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
+	$(call msg,BINARY,$@)
+	$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
+
+# delete failed targets
+.DELETE_ON_ERROR:
+
+# keep intermediate (.skel.h, .bpf.o, etc) targets
+.SECONDARY:
diff --git a/example/gpu/perthreadRuntimeDist/perthreadRuntimeDist.bpf.c b/example/gpu/perthreadRuntimeDist/perthreadRuntimeDist.bpf.c
new file mode 100644
index 000000000..a6be3fee6
--- /dev/null
+++ b/example/gpu/perthreadRuntimeDist/perthreadRuntimeDist.bpf.c
@@ -0,0 +1,62 @@
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "perthreadRuntimeDist.h"
+
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u32));
+} events SEC(".maps");
+
+// Records the start time per thread
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 8192);
+	__type(key, u32);
+	__type(value, u64);
+} start SEC(".maps");
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
+
+// EXT helper index -- must exist and be visible
+static const u64 (*bpf_get_globaltimer)(void) = (void *)502;
+
+// GPU kernel entry
+SEC("kprobe/cudaLaunchKernel")
+int cuda__kernel_entry(struct pt_regs *ctx)
+{
+	u32 tid = bpf_get_current_pid_tgid();
+
+	u64 start_cycles = bpf_get_globaltimer(); // bpf_ktime_get_ns()?
+
+	bpf_map_update_elem(&start, &tid, &start_cycles, BPF_ANY);
+
+	return 0;
+}
+
+SEC("kretprobe/cudaLaunchKernel")
+int cuda__kernel_exit(struct pt_regs *ctx)
+{
+	u32 tid = bpf_get_current_pid_tgid();
+	u64 *start_cycles = bpf_map_lookup_elem(&start, &tid);
+
+	if (!start_cycles)
+		return 0;
+
+	u64 end_cycles = bpf_get_globaltimer();
+
+	struct event_t evt = {
+		.tid = tid,
+		.cycles = end_cycles - *start_cycles,
+	};
+
+	bpf_perf_event_output(ctx, &events,
+			      BPF_F_CURRENT_CPU,
+			      &evt,
+			      sizeof(evt));
+
+	bpf_map_delete_elem(&start, &tid);
+	return 0;
+}
diff --git a/example/gpu/perthreadRuntimeDist/perthreadRuntimeDist.c b/example/gpu/perthreadRuntimeDist/perthreadRuntimeDist.c
new file mode 100644
index 000000000..9d2d8c52c
--- /dev/null
+++ b/example/gpu/perthreadRuntimeDist/perthreadRuntimeDist.c
@@ -0,0 +1,61 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <bpf/libbpf.h>
+#include "perthreadRuntimeDist.skel.h"
+#include "perthreadRuntimeDist.h"
+
+static void handle_event(void *ctx, int cpu, void *data, __u32 size)
+{
+	struct event_t *e = data;
+	printf("[CPU %d] tid=%u cycles=%llu ns\n",
+	       cpu, e->tid, e->cycles);
+}
+
+static void handle_lost(void *ctx, int cpu, __u64 lost)
+{
+	printf("LOST %llu events on CPU %d\n", lost, cpu);
+}
+
+int main()
+{
+	struct perthreadRuntimeDist_bpf *skel;
+	struct perf_buffer *pb;
+	int events_fd;
+
+	skel = perthreadRuntimeDist_bpf__open();
+	if (!skel) {
+		fprintf(stderr, "Failed to open BPF skeleton\n");
+		return 1;
+	}
+
+	if (perthreadRuntimeDist_bpf__load(skel)) {
+		fprintf(stderr, "Failed to load BPF skeleton\n");
+		return 1;
+	}
+
+	if (perthreadRuntimeDist_bpf__attach(skel)) {
+		fprintf(stderr, "Failed to attach BPF skeleton\n");
+		return 1;
+	}
+
+	printf("BPF attached successfully\n");
+
+	events_fd = bpf_map__fd(skel->maps.events);
+	pb = perf_buffer__new(events_fd, 16 /*buffer pages*/,
+			      handle_event, handle_lost, NULL, NULL);
+
+	if (!pb) {
+		fprintf(stderr, "Failed to open perf buffer\n");
+		return 1;
+	}
+
+	printf("Collecting data...\n");
+
+	while (1) {
+		int err = perf_buffer__poll(pb, 100 /*ms*/);
+		if (err < 0)
+			printf("perf_buffer__poll() error %d\n", err);
+	}
+
+	return 0;
+}
diff --git a/example/gpu/perthreadRuntimeDist/perthreadRuntimeDist.h b/example/gpu/perthreadRuntimeDist/perthreadRuntimeDist.h
new file mode 100644
index 000000000..fec960828
--- /dev/null
+++ b/example/gpu/perthreadRuntimeDist/perthreadRuntimeDist.h
@@ -0,0 +1,9 @@
+#ifndef __PERTHREAD_RUNTIME_DIST_H__
+#define __PERTHREAD_RUNTIME_DIST_H__
+
+struct event_t {
+	__u32 tid;
+	__u64 cycles;
+};
+
+#endif
diff --git a/example/gpu/perthreadRuntimeDist/thread_time_distribution_demo.cu b/example/gpu/perthreadRuntimeDist/thread_time_distribution_demo.cu
new file mode 100644
index 000000000..e988a80c8
--- /dev/null
+++ b/example/gpu/perthreadRuntimeDist/thread_time_distribution_demo.cu
@@ -0,0 +1,147 @@
+// thread_time_distribution_demo.cu
+#include <cstdio>
+#include <cstdlib>
+#include <cuda_runtime.h>
+
+// Simple CUDA error checking
+#define CHECK_CUDA(call)                                                    \
+    do {                                                                    \
+        cudaError_t err__ = (call);                                         \
+        if (err__ != cudaSuccess) {                                         \
+            fprintf(stderr, "CUDA error %s (%d) at %s:%d\n",                \
+                    cudaGetErrorString(err__), err__, __FILE__, __LINE__);  \
+            std::exit(EXIT_FAILURE);                                        \
+        }                                                                   \
+    } while (0)
+
+// Per-thread timing info
+struct ThreadTime {
+    unsigned int blockId;       // block index (1D grid)
+    unsigned int threadId;      // global thread ID (1D)
+    unsigned long long cycles;  // cycles spent in work section
+};
+
+// Kernel: each thread runs a synthetic workload, and we measure its time
+__global__ void timed_work_kernel(ThreadTime* out, int base_iters)
+{
+    const unsigned int blockId = blockIdx.x;
+    const unsigned int threadIdInBlock = threadIdx.x;
+    const unsigned int globalThreadId =
+        blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Each thread runs a slightly different number of iterations
+    // to produce a non-trivial distribution.
+    int my_iters = base_iters + (globalThreadId & 0x1F); // vary per thread
+
+    // Time the "work" section using clock64()
+    unsigned long long start = clock64();
+
+    volatile float acc = 0.0f;
+    for (int i = 0; i < my_iters; ++i) {
+        acc += 1.0f; // trivial arithmetic
+    }
+
+    unsigned long long end = clock64();
+
+    // Prevent the compiler from optimizing out the loop completely
+    if (acc == -1.0f) {
+        printf("This will never be printed\n");
+    }
+
+    ThreadTime t;
+    t.blockId = blockId;
+    t.threadId = globalThreadId;
+    t.cycles = end - start;
+
+    out[globalThreadId] = t;
+}
+
+int main()
+{
+    // Again, keep it small for printing
+    const int BLOCKS = 4;
+    const int THREADS_PER_BLOCK = 64;
+    const int TOTAL_THREADS = BLOCKS * THREADS_PER_BLOCK;
+
+    const int BASE_ITERS = 100000; // base workload per thread
+
+    printf("=== Per-thread runtime distribution demo ===\n");
+    printf("Grid: %d blocks, Block: %d threads (total %d threads)\n",
+           BLOCKS, THREADS_PER_BLOCK, TOTAL_THREADS);
+
+    // Allocate device and host buffers
+    ThreadTime* d_times = nullptr;
+    CHECK_CUDA(cudaMalloc(&d_times, TOTAL_THREADS * sizeof(ThreadTime)));
+
+    ThreadTime* h_times = (ThreadTime*)std::malloc(TOTAL_THREADS * sizeof(ThreadTime));
+    if (!h_times) {
+        fprintf(stderr, "Host malloc failed\n");
+        std::exit(EXIT_FAILURE);
+    }
+
+    // Launch kernel
+    dim3 grid(BLOCKS);
+    dim3 block(THREADS_PER_BLOCK);
+
+    timed_work_kernel<<<grid, block>>>(d_times, BASE_ITERS);
+    CHECK_CUDA(cudaGetLastError());
+    CHECK_CUDA(cudaDeviceSynchronize());
+
+    // Copy results back
+    CHECK_CUDA(cudaMemcpy(h_times, d_times,
+                          TOTAL_THREADS * sizeof(ThreadTime),
+                          cudaMemcpyDeviceToHost));
+
+    // Get device clock rate to convert cycles -> time
+    cudaDeviceProp prop;
+    CHECK_CUDA(cudaGetDeviceProperties(&prop, 0));
+
+    // clockRate: kHz (cycles per millisecond)
+    const double sm_freq_hz = (double)prop.clockRate * 1000.0;
+
+    printf("\nDevice: %s\n", prop.name);
+    printf("SM clock rate: %.3f MHz\n\n", sm_freq_hz / 1.0e6);
+
+    // Print per-thread timing
+    printf("Per-thread timing:\n");
+    printf("tid    block    cycles       time_us\n");
+    printf("--------------------------------------\n");
+
+    unsigned long long min_cycles = ~0ULL;
+    unsigned long long max_cycles = 0;
+    unsigned long long sum_cycles = 0;
+
+    for (int i = 0; i < TOTAL_THREADS; ++i) {
+        const ThreadTime& t = h_times[i];
+
+        double time_sec = (double)t.cycles / sm_freq_hz;
+        double time_us = time_sec * 1.0e6;
+
+        printf("%3u   %5u   %10llu   %10.3f\n",
+               t.threadId, t.blockId,
+               (unsigned long long)t.cycles, time_us);
+
+        if (t.cycles < min_cycles) min_cycles = t.cycles;
+        if (t.cycles > max_cycles) max_cycles = t.cycles;
+        sum_cycles += t.cycles;
+    }
+
+    double avg_cycles = (double)sum_cycles / TOTAL_THREADS;
+    double min_us = (double)min_cycles / sm_freq_hz * 1.0e6;
+    double max_us = (double)max_cycles / sm_freq_hz * 1.0e6;
+    double avg_us = avg_cycles / sm_freq_hz * 1.0e6;
+
+    printf("\nSummary over all threads:\n");
+    printf("  min cycles = %llu  (%.3f us)\n",
+           (unsigned long long)min_cycles, min_us);
+    printf("  max cycles = %llu  (%.3f us)\n",
+           (unsigned long long)max_cycles, max_us);
+    printf("  avg cycles = %.1f  (%.3f us)\n",
+           avg_cycles, avg_us);
+
+    // Cleanup
+    CHECK_CUDA(cudaFree(d_times));
+    std::free(h_times);
+
+    return 0;
+}
\ No newline at end of file
diff --git a/example/gpu/perthreadRuntimeDist/vec_add b/example/gpu/perthreadRuntimeDist/vec_add
new file mode 100755
index 000000000..681c868ff
Binary files /dev/null and b/example/gpu/perthreadRuntimeDist/vec_add differ
diff --git a/example/gpu/perthreadRuntimeDist/vec_add.cu b/example/gpu/perthreadRuntimeDist/vec_add.cu
new file mode 100644
index 000000000..4413cb6ff
--- /dev/null
+++ b/example/gpu/perthreadRuntimeDist/vec_add.cu
@@ -0,0 +1,85 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+#include <iostream>
+#include <vector>
+#include <chrono>
+#include <unistd.h>
+#include <cuda_runtime.h>
+
+/*
+nvcc -x cu -cuda vectorAdd.cu -o vectorAdd.cpp
+python filter_hashtag.py
+g++ vectorAdd-new.cpp -Wall -L /usr/local/cuda-12.6/lib64 -lcudart -o vectorAdd
+-g
+ */
+
+__constant__ int d_N;
+
+// A simple vector addition kernel
+__global__ void vectorAdd(const float *A, const float *B, float *C)
+{
+	int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+	// Grid-stride loop to handle cases where total threads < N
+	while (idx < d_N) {
+		C[idx] = A[idx] + B[idx];
+		idx += blockDim.x * gridDim.x; // Grid stride
+	}
+}
+
+int main()
+{
+	// Set vector size in constant memory
+	const int h_N = 1 << 20; // 1M elements
+	cudaMemcpyToSymbol(d_N, &h_N, sizeof(h_N));
+
+	size_t bytes = h_N * sizeof(float);
+
+	// Allocate and initialize host memory using vectors
+	std::vector<float> h_A(h_N), h_B(h_N), h_C(h_N);
+
+	for (int i = 0; i < h_N; ++i) {
+		h_A[i] = float(i);
+		h_B[i] = float(2 * i);
+	}
+
+	// Allocate Device memory
+	float *d_A, *d_B, *d_C;
+	cudaMalloc(&d_A, bytes);
+	cudaMalloc(&d_B, bytes);
+	cudaMalloc(&d_C, bytes);
+
+	// Copy to device
+	cudaMemcpy(d_A, h_A.data(), bytes, cudaMemcpyHostToDevice);
+	cudaMemcpy(d_B, h_B.data(), bytes, cudaMemcpyHostToDevice);
+
+	// Run the kernel in an infinite loop
+	while (true) {
+		// Zero output array
+		cudaMemset(d_C, 0, bytes);
+
+		// Launch kernel (grid-stride kernel, so any launch shape computes the full vector)
+		vectorAdd<<<(h_N + 255) / 256, 256>>>(d_A, d_B, d_C);
+		cudaDeviceSynchronize();
+		// Copy result back to host
+		cudaMemcpy(h_C.data(), d_C, bytes, cudaMemcpyDeviceToHost);
+
+		// Print first element as a check
+		std::cout << "C[0] = " << h_C[0] << " (expected 0)\n";
+		std::cout << "C[1] = " << h_C[1] << " (expected 3)\n";
+		std::cout << "C[2] = " << h_C[2] << " (expected 6)\n";
+
+		// Sleep for 1 second
+		sleep(1);
+	}
+
+	// Note: This code will never reach cleanup due to infinite loop
+	cudaFree(d_A);
+	cudaFree(d_B);
+	cudaFree(d_C);
+
+	return 0;
+}