diff --git a/example/gpu/perthreadRuntimeDist/.gitignore b/example/gpu/perthreadRuntimeDist/.gitignore
new file mode 100644
index 000000000..dda11ef6e
--- /dev/null
+++ b/example/gpu/perthreadRuntimeDist/.gitignore
@@ -0,0 +1,2 @@
+/perthreadRuntimeDist
+/.output
\ No newline at end of file
diff --git a/example/gpu/perthreadRuntimeDist/Makefile b/example/gpu/perthreadRuntimeDist/Makefile
new file mode 100644
index 000000000..9d6848501
--- /dev/null
+++ b/example/gpu/perthreadRuntimeDist/Makefile
@@ -0,0 +1,145 @@
+# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+OUTPUT := .output
+CLANG ?= clang
+LIBBPF_SRC := $(abspath ../../../third_party/libbpf/src)
+BPFTOOL_SRC := $(abspath ../../../third_party/bpftool/src)
+LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
+BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool)
+BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool
+ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
+			 | sed 's/arm.*/arm/' \
+			 | sed 's/aarch64/arm64/' \
+			 | sed 's/ppc64le/powerpc/' \
+			 | sed 's/mips.*/mips/' \
+			 | sed 's/riscv64/riscv/' \
+			 | sed 's/loongarch64/loongarch/')
+VMLINUX := ../../../third_party/vmlinux/$(ARCH)/vmlinux.h
+# Use our own libbpf API headers and Linux UAPI headers distributed with
+# libbpf to avoid dependency on system-wide headers, which could be missing or
+# outdated
+INCLUDES := -I$(OUTPUT) -I../../../third_party/libbpf/include/uapi -I$(dir $(VMLINUX))
+CFLAGS := -g -Wall
+ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)
+
+APPS = perthreadRuntimeDist # minimal minimal_legacy uprobe kprobe fentry usdt sockfilter tc ksyscall
+
+CARGO ?= $(shell which cargo)
+ifeq ($(strip $(CARGO)),)
+BZS_APPS :=
+else
+BZS_APPS := # profile
+APPS += $(BZS_APPS)
+# Required by libblazesym
+ALL_LDFLAGS += -lrt -ldl -lpthread -lm
+endif
+
+# Get Clang's default includes on this system. We'll explicitly add these dirs
+# to the includes list when compiling with `-target bpf` because otherwise some
+# architecture-specific dirs will be "missing" on some architectures/distros -
+# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h,
+# sys/cdefs.h etc. might be missing.
+#
+# Use '-idirafter': Don't interfere with include mechanics except where the
+# build would have failed anyways.
+CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
+
+ifeq ($(V),1)
+	Q =
+	msg =
+else
+	Q = @
+	msg = @printf '  %-8s %s%s\n' \
+		      "$(1)" \
+		      "$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
+		      "$(if $(3), $(3))";
+	MAKEFLAGS += --no-print-directory
+endif
+
+define allow-override
+  $(if $(or $(findstring environment,$(origin $(1))),\
+            $(findstring command line,$(origin $(1)))),,\
+    $(eval $(1) = $(2)))
+endef
+
+$(call allow-override,CC,$(CROSS_COMPILE)cc)
+$(call allow-override,LD,$(CROSS_COMPILE)ld)
+
+.PHONY: all
+all: $(APPS) vec_add
+
+vec_add: vec_add.cu
+	@if command -v nvcc >/dev/null 2>&1; then \
+		nvcc -arch=sm_61 -cudart shared vec_add.cu -o vec_add -g; \
+	else \
+		echo "Warning: CUDA not found, skipping vec_add build"; \
+	fi
+
+.PHONY: clean
+clean:
+	$(call msg,CLEAN)
+	$(Q)rm -rf $(OUTPUT) $(APPS) vec_add
+
+$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
+	$(call msg,MKDIR,$@)
+	$(Q)mkdir -p $@
+
+# Build libbpf
+$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
+	$(call msg,LIB,$@)
+	$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \
+		    OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \
+		    INCLUDEDIR= LIBDIR= UAPIDIR= \
+		    install
+
+# Build bpftool
+$(BPFTOOL): | $(BPFTOOL_OUTPUT)
+	$(call msg,BPFTOOL,$@)
+	$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
+
+
+$(LIBBLAZESYM_SRC)/target/release/libblazesym.a::
+	$(Q)cd $(LIBBLAZESYM_SRC) && $(CARGO) build --features=cheader,dont-generate-test-files --release
+
+$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
+	$(call msg,LIB, $@)
+	$(Q)cp $(LIBBLAZESYM_SRC)/target/release/libblazesym.a $@
+
+$(LIBBLAZESYM_HEADER): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
+	$(call msg,LIB,$@)
+	$(Q)cp $(LIBBLAZESYM_SRC)/target/release/blazesym.h $@
+
+# Build BPF code
+$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
+	$(call msg,BPF,$@)
+	$(Q)$(CLANG) -Xlinker --export-dynamic -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \
+		     $(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
+		     -c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
+	$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
+
+# Generate BPF skeletons
+$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
+	$(call msg,GEN-SKEL,$@)
+	$(Q)$(BPFTOOL) gen skeleton $< > $@
+
+# Build user-space code
+$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h
+
+$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
+	$(call msg,CC,$@)
+	$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
+
+$(patsubst %,$(OUTPUT)/%.o,$(BZS_APPS)): $(LIBBLAZESYM_HEADER)
+
+$(BZS_APPS): $(LIBBLAZESYM_OBJ)
+
+# Build application binary
+$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
+	$(call msg,BINARY,$@)
+	$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
+
+# delete failed targets
+.DELETE_ON_ERROR:
+
+# keep intermediate (.skel.h, .bpf.o, etc) targets
+.SECONDARY:
diff --git a/example/gpu/perthreadRuntimeDist/perthreadRuntimeDist.bpf.c b/example/gpu/perthreadRuntimeDist/perthreadRuntimeDist.bpf.c
new file mode 100644
index 000000000..a6be3fee6
--- /dev/null
+++ b/example/gpu/perthreadRuntimeDist/perthreadRuntimeDist.bpf.c
@@ -0,0 +1,62 @@
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "perthreadRuntimeDist.h"
+
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u32));
+} events SEC(".maps");
+
+// Records the start time per thread
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 8192);
+	__type(key, u32);
+	__type(value, u64);
+} start SEC(".maps");
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
+
+// EXT helper index -- must exist and be visible
+static const u64 (*bpf_get_globaltimer)(void) = (void *)502;
+
+// GPU kernel entry
+SEC("kprobe/cudaLaunchKernel")
+int cuda__kernel_entry(struct pt_regs *ctx)
+{
+	u32 tid = bpf_get_current_pid_tgid();
+
+	u64 start_cycles = bpf_get_globaltimer(); // bpf_ktime_get_ns()?
+
+	bpf_map_update_elem(&start, &tid, &start_cycles, BPF_ANY);
+
+	return 0;
+}
+
+SEC("kretprobe/cudaLaunchKernel")
+int cuda__kernel_exit(struct pt_regs *ctx)
+{
+	u32 tid = bpf_get_current_pid_tgid();
+	u64 *start_cycles = bpf_map_lookup_elem(&start, &tid);
+
+	if (!start_cycles)
+		return 0;
+
+	u64 end_cycles = bpf_get_globaltimer();
+
+	struct event_t evt = {
+		.tid = tid,
+		.cycles = end_cycles - *start_cycles,
+	};
+
+	bpf_perf_event_output(ctx, &events,
+			      BPF_F_CURRENT_CPU,
+			      &evt,
+			      sizeof(evt));
+
+	bpf_map_delete_elem(&start, &tid);
+	return 0;
+}
diff --git a/example/gpu/perthreadRuntimeDist/perthreadRuntimeDist.c b/example/gpu/perthreadRuntimeDist/perthreadRuntimeDist.c
new file mode 100644
index 000000000..9d2d8c52c
--- /dev/null
+++ b/example/gpu/perthreadRuntimeDist/perthreadRuntimeDist.c
@@ -0,0 +1,61 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <bpf/libbpf.h>
+#include "perthreadRuntimeDist.skel.h"
+#include "perthreadRuntimeDist.h"
+
+static void handle_event(void *ctx, int cpu, void *data, __u32 size)
+{
+	struct event_t *e = data;
+	printf("[CPU %d] tid=%u cycles=%llu ns\n",
+	       cpu, e->tid, e->cycles);
+}
+
+static void handle_lost(void *ctx, int cpu, __u64 lost)
+{
+	printf("LOST %llu events on CPU %d\n", lost, cpu);
+}
+
+int main()
+{
+	struct perthreadRuntimeDist_bpf *skel;
+	struct perf_buffer *pb;
+	int events_fd;
+
+	skel = perthreadRuntimeDist_bpf__open();
+	if (!skel) {
+		fprintf(stderr, "Failed to open BPF skeleton\n");
+		return 1;
+	}
+
+	if (perthreadRuntimeDist_bpf__load(skel)) {
+		fprintf(stderr, "Failed to load BPF skeleton\n");
+		return 1;
+	}
+
+	if (perthreadRuntimeDist_bpf__attach(skel)) {
+		fprintf(stderr, "Failed to attach BPF skeleton\n");
+		return 1;
+	}
+
+	printf("BPF attached successfully\n");
+
+	events_fd = bpf_map__fd(skel->maps.events);
+	pb = perf_buffer__new(events_fd, 16 /*buffer pages*/,
+			      handle_event, handle_lost, NULL, NULL);
+
+	if (!pb) {
+		fprintf(stderr, "Failed to open perf buffer\n");
+		return 1;
+	}
+
+	printf("Collecting data...\n");
+
+	while (1) {
+		int err = perf_buffer__poll(pb, 100 /*ms*/);
+		if (err < 0)
+			printf("perf_buffer__poll() error %d\n", err);
+	}
+
+	return 0;
+}
diff --git a/example/gpu/perthreadRuntimeDist/perthreadRuntimeDist.h b/example/gpu/perthreadRuntimeDist/perthreadRuntimeDist.h
new file mode 100644
index 000000000..fec960828
--- /dev/null
+++ b/example/gpu/perthreadRuntimeDist/perthreadRuntimeDist.h
@@ -0,0 +1,9 @@
+#ifndef __PERTHREAD_RUNTIME_DIST_H__
+#define __PERTHREAD_RUNTIME_DIST_H__
+
+struct event_t {
+	__u32 tid;
+	__u64 cycles;
+};
+
+#endif
diff --git a/example/gpu/perthreadRuntimeDist/thread_time_distribution_demo.cu b/example/gpu/perthreadRuntimeDist/thread_time_distribution_demo.cu
new file mode 100644
index 000000000..e988a80c8
--- /dev/null
+++ b/example/gpu/perthreadRuntimeDist/thread_time_distribution_demo.cu
@@ -0,0 +1,147 @@
+// thread_time_distribution_demo.cu
+#include <cstdio>
+#include <cstdlib>
+#include <cuda_runtime.h>
+
+// Simple CUDA error checking
+#define CHECK_CUDA(call)                                                    \
+    do {                                                                    \
+        cudaError_t err__ = (call);                                         \
+        if (err__ != cudaSuccess) {                                         \
+            fprintf(stderr, "CUDA error %s (%d) at %s:%d\n",                \
+                    cudaGetErrorString(err__), err__, __FILE__, __LINE__);  \
+            std::exit(EXIT_FAILURE);                                        \
+        }                                                                   \
+    } while (0)
+
+// Per-thread timing info
+struct ThreadTime {
+    unsigned int blockId;       // block index (1D grid)
+    unsigned int threadId;      // global thread ID (1D)
+    unsigned long long cycles;  // cycles spent in work section
+};
+
+// Kernel: each thread runs a synthetic workload, and we measure its time
+__global__ void timed_work_kernel(ThreadTime* out, int base_iters)
+{
+    const unsigned int blockId = blockIdx.x;
+    const unsigned int threadIdInBlock = threadIdx.x;
+    const unsigned int globalThreadId =
+        blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Each thread runs a slightly different number of iterations
+    // to produce a non-trivial distribution.
+    int my_iters = base_iters + (globalThreadId & 0x1F); // vary per thread
+
+    // Time the "work" section using clock64()
+    unsigned long long start = clock64();
+
+    volatile float acc = 0.0f;
+    for (int i = 0; i < my_iters; ++i) {
+        acc += 1.0f; // trivial arithmetic
+    }
+
+    unsigned long long end = clock64();
+
+    // Prevent the compiler from optimizing out the loop completely
+    if (acc == -1.0f) {
+        printf("This will never be printed\n");
+    }
+
+    ThreadTime t;
+    t.blockId = blockId;
+    t.threadId = globalThreadId;
+    t.cycles = end - start;
+
+    out[globalThreadId] = t;
+}
+
+int main()
+{
+    // Again, keep it small for printing
+    const int BLOCKS = 4;
+    const int THREADS_PER_BLOCK = 64;
+    const int TOTAL_THREADS = BLOCKS * THREADS_PER_BLOCK;
+
+    const int BASE_ITERS = 100000; // base workload per thread
+
+    printf("=== Per-thread runtime distribution demo ===\n");
+    printf("Grid: %d blocks, Block: %d threads (total %d threads)\n",
+           BLOCKS, THREADS_PER_BLOCK, TOTAL_THREADS);
+
+    // Allocate device and host buffers
+    ThreadTime* d_times = nullptr;
+    CHECK_CUDA(cudaMalloc(&d_times, TOTAL_THREADS * sizeof(ThreadTime)));
+
+    ThreadTime* h_times = (ThreadTime*)std::malloc(TOTAL_THREADS * sizeof(ThreadTime));
+    if (!h_times) {
+        fprintf(stderr, "Host malloc failed\n");
+        std::exit(EXIT_FAILURE);
+    }
+
+    // Launch kernel
+    dim3 grid(BLOCKS);
+    dim3 block(THREADS_PER_BLOCK);
+
+    timed_work_kernel<<<grid, block>>>(d_times, BASE_ITERS);
+    CHECK_CUDA(cudaGetLastError());
+    CHECK_CUDA(cudaDeviceSynchronize());
+
+    // Copy results back
+    CHECK_CUDA(cudaMemcpy(h_times, d_times,
+                          TOTAL_THREADS * sizeof(ThreadTime),
+                          cudaMemcpyDeviceToHost));
+
+    // Get device clock rate to convert cycles -> time
+    cudaDeviceProp prop;
+    CHECK_CUDA(cudaGetDeviceProperties(&prop, 0));
+
+    // clockRate: kHz (cycles per millisecond)
+    const double sm_freq_hz = (double)prop.clockRate * 1000.0;
+
+    printf("\nDevice: %s\n", prop.name);
+    printf("SM clock rate: %.3f MHz\n\n", sm_freq_hz / 1.0e6);
+
+    // Print per-thread timing
+    printf("Per-thread timing:\n");
+    printf("tid    block    cycles       time_us\n");
+    printf("--------------------------------------\n");
+
+    unsigned long long min_cycles = ~0ULL;
+    unsigned long long max_cycles = 0;
+    unsigned long long sum_cycles = 0;
+
+    for (int i = 0; i < TOTAL_THREADS; ++i) {
+        const ThreadTime& t = h_times[i];
+
+        double time_sec = (double)t.cycles / sm_freq_hz;
+        double time_us = time_sec * 1.0e6;
+
+        printf("%3u   %5u   %10llu   %10.3f\n",
+               t.threadId, t.blockId,
+               (unsigned long long)t.cycles, time_us);
+
+        if (t.cycles < min_cycles) min_cycles = t.cycles;
+        if (t.cycles > max_cycles) max_cycles = t.cycles;
+        sum_cycles += t.cycles;
+    }
+
+    double avg_cycles = (double)sum_cycles / TOTAL_THREADS;
+    double min_us = (double)min_cycles / sm_freq_hz * 1.0e6;
+    double max_us = (double)max_cycles / sm_freq_hz * 1.0e6;
+    double avg_us = avg_cycles / sm_freq_hz * 1.0e6;
+
+    printf("\nSummary over all threads:\n");
+    printf("  min cycles = %llu  (%.3f us)\n",
+           (unsigned long long)min_cycles, min_us);
+    printf("  max cycles = %llu  (%.3f us)\n",
+           (unsigned long long)max_cycles, max_us);
+    printf("  avg cycles = %.1f  (%.3f us)\n",
+           avg_cycles, avg_us);
+
+    // Cleanup
+    CHECK_CUDA(cudaFree(d_times));
+    std::free(h_times);
+
+    return 0;
+}
\ No newline at end of file
diff --git a/example/gpu/perthreadRuntimeDist/vec_add b/example/gpu/perthreadRuntimeDist/vec_add
new file mode 100755
index 000000000..681c868ff
Binary files /dev/null and b/example/gpu/perthreadRuntimeDist/vec_add differ
diff --git a/example/gpu/perthreadRuntimeDist/vec_add.cu b/example/gpu/perthreadRuntimeDist/vec_add.cu
new file mode 100644
index 000000000..4413cb6ff
--- /dev/null
+++ b/example/gpu/perthreadRuntimeDist/vec_add.cu
@@ -0,0 +1,85 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+#include <iostream>
+#include <vector>
+#include <chrono>
+#include <unistd.h>
+#include <cuda_runtime.h>
+
+/*
+nvcc -x cu -cuda vectorAdd.cu -o vectorAdd.cpp
+python filter_hashtag.py
+g++ vectorAdd-new.cpp -Wall -L /usr/local/cuda-12.6/lib64 -lcudart -o vectorAdd
+-g
+ */
+
+__constant__ int d_N;
+
+// A simple vector addition kernel
+__global__ void vectorAdd(const float *A, const float *B, float *C)
+{
+	int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+	// Grid-stride loop to handle cases where total threads < N
+	while (idx < d_N) {
+		C[idx] = A[idx] + B[idx];
+		idx += blockDim.x * gridDim.x; // Grid stride
+	}
+}
+
+int main()
+{
+	// Set vector size in constant memory
+	const int h_N = 1 << 20; // 1M elements
+	cudaMemcpyToSymbol(d_N, &h_N, sizeof(h_N));
+
+	size_t bytes = h_N * sizeof(float);
+
+	// Allocate and initialize host memory using vectors
+	std::vector<float> h_A(h_N), h_B(h_N), h_C(h_N);
+
+	for (int i = 0; i < h_N; ++i) {
+		h_A[i] = float(i);
+		h_B[i] = float(2 * i);
+	}
+
+	// Allocate Device memory
+	float *d_A, *d_B, *d_C;
+	cudaMalloc(&d_A, bytes);
+	cudaMalloc(&d_B, bytes);
+	cudaMalloc(&d_C, bytes);
+
+	// Copy to device
+	cudaMemcpy(d_A, h_A.data(), bytes, cudaMemcpyHostToDevice);
+	cudaMemcpy(d_B, h_B.data(), bytes, cudaMemcpyHostToDevice);
+
+	// Run the kernel in an infinite loop
+	while (true) {
+		// Zero output array
+		cudaMemset(d_C, 0, bytes);
+
+		// Launch kernel (grid-stride kernel, so any launch shape computes the full vector)
+		vectorAdd<<<(h_N + 255) / 256, 256>>>(d_A, d_B, d_C);
+		cudaDeviceSynchronize();
+		// Copy result back to host
+		cudaMemcpy(h_C.data(), d_C, bytes, cudaMemcpyDeviceToHost);
+
+		// Print first element as a check
+		std::cout << "C[0] = " << h_C[0] << " (expected 0)\n";
+		std::cout << "C[1] = " << h_C[1] << " (expected 3)\n";
+		std::cout << "C[2] = " << h_C[2] << " (expected 6)\n";
+
+		// Sleep for 1 second
+		sleep(1);
+	}
+
+	// Note: This code will never reach cleanup due to infinite loop
+	cudaFree(d_A);
+	cudaFree(d_B);
+	cudaFree(d_C);
+
+	return 0;
+}