diff --git a/uarch_bench/Makefile b/uarch_bench/Makefile
new file mode 100644
index 00000000..cf9bc179
--- /dev/null
+++ b/uarch_bench/Makefile
@@ -0,0 +1,42 @@
+CC = gcc
+CFLAGS = -O3 -Wall -Wextra
+
+CUDA_ROOT_DIR=/usr/local/cuda-12.8
+CUDA_ARCH=sm_100
+
+CUDA_LIB_DIR=-L$(CUDA_ROOT_DIR)/lib64
+CUDA_INC_DIR=-I$(CUDA_ROOT_DIR)/include
+NVCC = $(CUDA_ROOT_DIR)/bin/nvcc
+NVCCFLAGS = -O3 -arch=$(CUDA_ARCH) $(CUDA_INC_DIR) $(CUDA_LIB_DIR) -Xcompiler "-Wall -Wextra"
+
+TARGET = frontend_study
+CUDA_TARGET = fe_study_cuda
+THROUGHPUT_TARGET = instr_throughput
+BTB_TARGET = btb_estimate
+BTB_CALLS_TARGET = btb_estimate_calls
+OBJS = utils.o
+
+all: $(TARGET) $(CUDA_TARGET) $(THROUGHPUT_TARGET) $(BTB_TARGET) $(BTB_CALLS_TARGET)
+
+$(TARGET): frontend_study.c $(OBJS)
+	$(CC) $(CFLAGS) -o $(TARGET) frontend_study.c $(OBJS)
+
+$(CUDA_TARGET): fe_study_cuda.cu $(OBJS)
+	$(NVCC) $(NVCCFLAGS) -o $(CUDA_TARGET) fe_study_cuda.cu $(OBJS)
+
+$(THROUGHPUT_TARGET): instr_throughput.c $(OBJS)
+	$(CC) $(CFLAGS) -o $(THROUGHPUT_TARGET) instr_throughput.c $(OBJS)
+
+$(BTB_TARGET): btb_estimate.c $(OBJS)
+	$(CC) $(CFLAGS) -o $(BTB_TARGET) btb_estimate.c $(OBJS)
+
+$(BTB_CALLS_TARGET): btb_estimate_calls.c $(OBJS)
+	$(CC) $(CFLAGS) -o $(BTB_CALLS_TARGET) btb_estimate_calls.c $(OBJS)
+
+utils.o: utils.c utils.h
+	$(CC) $(CFLAGS) -c utils.c
+
+clean:
+	rm -f $(TARGET) $(CUDA_TARGET) $(THROUGHPUT_TARGET) $(BTB_TARGET) $(BTB_CALLS_TARGET) $(OBJS)
+
+.PHONY: all clean
diff --git a/uarch_bench/README.md b/uarch_bench/README.md
new file mode 100644
index 00000000..cb27892c
--- /dev/null
+++ b/uarch_bench/README.md
@@ -0,0 +1,233 @@
+# Microarchitecture Benchmark Suite
+
+This directory contains a suite of microbenchmarks designed to measure CPU microarchitectural properties, with a focus on instruction frontend (fetch/decode) behavior, cache hierarchies, and branch prediction characteristics.
+
+## Overview
+
+These benchmarks help characterize processor behavior by executing synthetic workloads and collecting performance counter data via Linux perf. The suite supports multiple architectures (x86-64, ARM64). One microbenchmark is specific to NVIDIA GPUs and measures performance counters for kernel launches.
+
+## Microbenchmarks
+
+### 1. frontend_study
+
+**File:** `frontend_study.c`
+
+**Purpose:** Measure instruction frontend performance and cache behavior with configurable code layout.
+
+**What it measures:**
+- iTLB misses (Instruction Translation Lookaside Buffer)
+- L1I cache misses (Level 1 Instruction cache)
+- Branch prediction misses
+- L2 cache loads
+- Cycles and instructions executed
+
+**Key features:**
+- Creates multiple dynamically allocated memory regions containing function copies
+- Functions are filled with architecture-specific NOPs (4-byte instructions)
+- Adjustable function sizes: 16, 64, 256, 1024, 4096, or 8192 NOPs
+- Two access patterns:
+  - **Sequential**: Calls functions in order modulo divisor
+  - **Random**: Uses PRNG for random function selection
+
+**Command-line parameters:**
+```
+-d : Number of different functions to cycle through
+-i : Total iterations of measurement loop
+-b : Size of allocated memory regions (MB)
+-n : Number of separate memory regions
+-s : Page size for function alignment (KB)
+-f : Function size in NOPs (16/64/256/1024/4096/8192)
+-r : 0=sequential, 1=random function selection
+```
+
+**Example usage:**
+```bash
+./frontend_study -d 10 -i 1000000 -b 32 -n 256 -s 64 -f 1024 -r 0
+```
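+
+**How it works (sketch):** The benchmark copies a compiled NOP function into executable pages and calls the copies through pointers. Below is a minimal, hypothetical illustration of that mechanism; the real code spreads many copies across regions at randomized page offsets, and the size-by-adjacent-symbol trick assumes the linker keeps the two functions adjacent:
+
+```c
+#include <string.h>
+#include <sys/mman.h>
+
+__attribute__((noinline)) void nop_body(void) { /* NOPs */ }
+__attribute__((noinline)) void nop_body_end(void) {}
+
+int main(void) {
+  // Estimate the function's size from the adjacent marker symbol.
+  size_t func_size = (size_t)((char*)nop_body_end - (char*)nop_body);
+
+  void* page = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
+                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (page == MAP_FAILED) return 1;
+
+  memcpy(page, (void*)nop_body, func_size);  // relocate the code
+  __builtin___clear_cache((char*)page, (char*)page + func_size);
+  ((void (*)(void))page)();                  // call the relocated copy
+
+  munmap(page, 4096);
+  return 0;
+}
+```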
+### 2. instr_throughput
+
+**File:** `instr_throughput.c`
+
+**Purpose:** Measure instruction fetch and decode throughput across varying code size ranges.
+
+**What it measures:**
+- Cycles and instructions for code execution
+- L1I cache misses
+- iTLB misses
+- L2 cache loads
+- DRAM reads (LL cache misses)
+- Bytes per cycle (throughput metric)
+
+**Key features:**
+- Tests 19 different code sizes from 1 KB to 1 GB
+- Dynamically generates executable code buffers filled with NOP instructions
+- Per-iteration metric collection
+- Auto-scales iterations based on code size (larger code = fewer iterations)
+
+**Typical test sizes:** 1K, 4K, 8K, 16K, 32K, 64K, 128K, 192K, 256K, 512K, 1M, 4M, 16M, 32M, 64M, 192M, 256M, 512M, 1G
+
+**Output:** Throughput metrics per iteration for each code size
+
+### 3. btb_estimate
+
+**File:** `btb_estimate.c`
+
+**Purpose:** Estimate Branch Target Buffer (BTB) capacity and behavior by measuring branch prediction performance.
+
+**What it measures:**
+- Branch misses
+- Instructions executed
+- Misses per instruction, per iteration, per buffer entry
+
+**Key features:**
+- Tests with buffer sizes from 256 to 262,144 entries (13 sizes)
+- Buffer contains random 0s and 1s
+- Based on the buffer value, executes 1024 or 1023 NOPs (a one-instruction difference)
+- Tracks when branch prediction fails
+- Helps determine BTB capacity on the processor
+
+**Logic:** A sharp rise in misses at a given buffer size indicates that the working set of branches has exceeded BTB capacity.
+
+### 4. btb_estimate_calls
+
+**File:** `btb_estimate_calls.c`
+
+**Purpose:** Estimate BTB capacity using indirect function calls instead of conditional branches.
+
+**What it measures:**
+- Branch misses from call instructions
+- Instructions executed
+- Misses per iteration and per function pointer
+
+**Key features:**
+- Allocates 128 to 4096 function pointers
+- Each function copy is placed on its own 64KB page
+- All functions are identical (1024 NOPs + return)
+- Measures branch prediction misses when calling through these pointers
+- Random offsets within pages prevent trivial prediction
+
+**Pages tested:** 128 to 4096 pages (21 sizes)
+
+**Output:** Branch miss metrics as function count increases (identifies BTB saturation point)
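+
+**Sketch of the call-based approach:** Both BTB benchmarks create many distinct branch targets and watch when misses per iteration start to climb. A toy, hypothetical version of the indirect-call loop at the heart of `btb_estimate_calls`:
+
+```c
+typedef void (*fn_t)(void);
+
+// Calling through an array of pointers to distinct code copies exercises one
+// branch-target-buffer entry per target; once `count` exceeds BTB capacity,
+// branch misses per iteration rise sharply.
+long long run_calls(fn_t* funcs, int count, int iterations) {
+  long long calls = 0;
+  for (int it = 0; it < iterations; it++) {
+    for (int i = 0; i < count; i++) {
+      funcs[i]();  // indirect call -> BTB lookup per target
+      calls++;
+    }
+  }
+  return calls;
+}
+```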
+
+### 5. fe_study_cuda
+
+**File:** `fe_study_cuda.cu`
+
+**Purpose:** Study instruction frontend behavior of the kernel-launch path for NVIDIA GPUs.
+
+**What it measures:**
+- Host CPU cycles and instructions per kernel launch
+- Host L1I cache misses (or iTLB misses / L2 loads, selectable) during launches
+- Kernel launch latency for null and non-null streams
+- Overhead of the L1I flush routine itself (profiled separately and subtracted)
+
+**Features:**
+- CUDA kernel compilation (sm_90 for x86-64, sm_100 for ARM)
+- NOP-based synthetic workloads
+- Profiles flush overhead for L1I cache
+
+**Target architecture:** NVIDIA GPUs
+
+## Supporting Files
+
+### utils.c / utils.h
+
+Provides common functionality for all benchmarks:
+- **Perf counter abstraction**: Wraps the Linux `perf_event_open` syscall
+- **Counter sets**: iTLB, L1I, Branch, L2, DRAM reads
+- **CPU frequency detection**: Reads from `/sys/devices/system/cpu/` or `/proc/cpuinfo`
+- **Measurement results aggregation**: Structures for storing multi-counter data
+- **LCG random number generator**: Deterministic RNG (`my_rand()`)
+
+### run_benchmark.sh
+
+Test harness for batch execution:
+- Reads input configuration from a file
+- Executes benchmarks with multiple parameter combinations
+- Outputs results in CSV format
+
+### full_run_input.txt
+
+Sample configuration parameters for `frontend_study`:
+- Tests varying divisors (16-512), iterations (10M-100M)
+- Memory configurations: 1-512MB buffers with 64KB pages
+- Function sizes: 16-8192 NOPs
+
+**Example row:** `10000000,100000000,32,256,64,16,0`
+- Divisor=10M, Iterations=100M, Buffer=32MB, Buffers=256, Page=64KB, Function=16 NOPs, No random
+
+## Building
+
+```bash
+make
+```
+
+The Makefile creates these executables:
+1. `frontend_study` - CPU frontend study (gcc)
+2. `fe_study_cuda` - CUDA version (nvcc)
+3. `instr_throughput` - Throughput benchmark
+4. `btb_estimate` - BTB sizing (branches)
+5. `btb_estimate_calls` - BTB sizing (calls)
+
+Build configuration:
+- Uses `gcc` for C benchmarks with `-O3` optimization
+- Uses `nvcc` for CUDA benchmarks with architecture-specific targets
+- Supports both x86-64 and ARM64 architectures
+
+## Usage Examples
+
+**Individual benchmark:**
+```bash
+./frontend_study -d 10 -i 1000000 -b 32 -n 256 -s 64 -f 1024 -r 0
+```
+
+**Batch execution with script:**
+```bash
+./run_benchmark.sh full_run_input.txt > results.csv
+```
+
+**Run instruction throughput tests:**
+```bash
+./instr_throughput
+```
+
+**Estimate BTB capacity:**
+```bash
+./btb_estimate
+./btb_estimate_calls
+```
+
+## Measurement Focus Areas
+
+| Benchmark | Primary Focus | Architecture | Key Metrics |
+|-----------|--------------|--------------|-------------|
+| frontend_study | Frontend/cache behavior | x86-64, ARM64 | iTLB, L1I, L2, Branch misses |
+| instr_throughput | Code size scaling | x86-64, ARM64 | Throughput, cache misses |
+| btb_estimate | BTB capacity (branches) | x86-64, ARM64 | Branch misses vs buffer size |
+| btb_estimate_calls | BTB capacity (calls) | x86-64, ARM64 | Call misses vs function count |
+| fe_study_cuda | GPU launch-path frontend | CUDA | Launch latency, counter metrics per launch |
+
+## Performance Counter Requirements
+
+These benchmarks require access to Linux perf events. You may need to adjust the `perf_event_paranoid` setting:
+
+```bash
+# Check current setting
+cat /proc/sys/kernel/perf_event_paranoid
+
+# Allow user access to performance counters (may require sudo)
+echo 1 | sudo tee /proc/sys/kernel/perf_event_paranoid
+```
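+
+Under the hood, the suite's counter plumbing is the raw `perf_event_open` syscall (wrapped in `utils.c`). For reference, a minimal stand-alone sketch of a single cycles counter (error handling trimmed; not the exact `utils.c` code):
+
+```c
+#include <linux/perf_event.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+int main(void) {
+  struct perf_event_attr pe;
+  memset(&pe, 0, sizeof(pe));
+  pe.type = PERF_TYPE_HARDWARE;
+  pe.size = sizeof(pe);
+  pe.config = PERF_COUNT_HW_CPU_CYCLES;
+  pe.disabled = 1;
+  pe.exclude_kernel = 1;
+
+  // pid=0, cpu=-1: count for this process on any CPU; no event group.
+  int fd = (int)syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
+  if (fd == -1) { perror("perf_event_open"); return 1; }
+
+  ioctl(fd, PERF_EVENT_IOC_RESET, 0);
+  ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
+  for (volatile int i = 0; i < 1000000; i++) { }  // region of interest
+  ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
+
+  long long cycles = 0;
+  read(fd, &cycles, sizeof(cycles));
+  printf("cycles: %lld\n", cycles);
+  close(fd);
+  return 0;
+}
+```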
+
+## Use Cases
+
+This benchmark suite is useful for:
+- Characterizing CPU microarchitecture behavior
+- Identifying performance bottlenecks in instruction fetch and decode
+- Understanding cache hierarchy characteristics
+- Measuring branch prediction capabilities and BTB capacity
+- Comparing performance across different processor generations
+- GPU instruction frontend analysis
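+
+## Measurement Skeleton
+
+All CPU benchmarks share the same measurement pattern around the `utils.h` API (names as declared there); a condensed sketch:
+
+```c
+#include <stdio.h>
+#include "utils.h"
+
+int main(void) {
+  PerfCounters perf;
+  // COUNTER_SET_* selects the third event measured next to cycles/instructions.
+  if (perf_counters_init(&perf, COUNTER_SET_L1I, 0) != 0) {
+    fprintf(stderr, "perf counters unavailable\n");
+    return 1;
+  }
+
+  perf_counters_enable(&perf);
+  /* ... workload under test ... */
+  perf_counters_disable_and_read(&perf);
+
+  printf("cycles=%lld instructions=%lld extra=%lld\n",
+         perf.count_cycles, perf.count_instructions, perf.count_extra);
+  perf_counters_cleanup(&perf);
+  return 0;
+}
+```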
"Misses/Iter", + "Misses/Buffer Size"); + printf( + "---------------------------------------------------------------------------------------------\n"); + + for (int i = 0; i < num_sizes; i++) { + int size = sizes[i]; + + // Allocate and initialize buffer with random 0s and 1s + int* buffer = (int*)malloc(size * sizeof(int)); + if (!buffer) { + fprintf(stderr, "Memory allocation failed\n"); + perf_counters_cleanup(&perf); + return 1; + } + + for (int j = 0; j < size; j++) { + buffer[j] = rand() % 2; // Random 0 or 1 + } + + // Process buffer and measure + long long warmup_sum = process_buffer(buffer, size, iterations[i], &perf); + + double misses_per_instruction = + (double)perf.count_extra / perf.count_instructions; + double misses_per_iteration = (double)perf.count_extra / iterations[i]; + double misses_per_buffer_size = + (double)perf.count_extra / iterations[i] / size; + + printf( + "%-10d %-12lld %-15lld %-15lld %-15.6f %-15.2f %-15.2f\n", + size, + warmup_sum, + perf.count_instructions, + perf.count_extra, + misses_per_instruction, + misses_per_iteration, + misses_per_buffer_size); + + free(buffer); + } + + perf_counters_cleanup(&perf); + + return 0; +} diff --git a/uarch_bench/btb_estimate_calls.c b/uarch_bench/btb_estimate_calls.c new file mode 100644 index 00000000..118d8d47 --- /dev/null +++ b/uarch_bench/btb_estimate_calls.c @@ -0,0 +1,204 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include +#include +#include +#include +#include +#include +#include "utils.h" + +// Architecture-specific NOPs - both use 4-byte instructions +#if defined(__aarch64__) +#define ARCH_NOOP __asm__ __volatile__("nop\n\t"); // ARM64 NOP (4 bytes) +#elif defined(__x86_64__) +// x86-64 4-byte NOP: 0F 1F 40 00 (NOP DWORD PTR [RAX+0]) +#define ARCH_NOOP __asm__ __volatile__(".byte 0x0F, 0x1F, 0x40, 0x00\n\t"); +#else +#error "Unsupported architecture" +#endif + +// Build up NOPs hierarchically +#define ARCH_NOOP_2 ARCH_NOOP ARCH_NOOP +#define ARCH_NOOP_4 ARCH_NOOP_2 ARCH_NOOP_2 +#define ARCH_NOOP_8 ARCH_NOOP_4 ARCH_NOOP_4 +#define ARCH_NOOP_16 ARCH_NOOP_8 ARCH_NOOP_8 +#define ARCH_NOOP_32 ARCH_NOOP_16 ARCH_NOOP_16 +#define ARCH_NOOP_64 ARCH_NOOP_32 ARCH_NOOP_32 +#define ARCH_NOOP_128 ARCH_NOOP_64 ARCH_NOOP_64 +#define ARCH_NOOP_256 ARCH_NOOP_128 ARCH_NOOP_128 +#define ARCH_NOOP_512 ARCH_NOOP_256 ARCH_NOOP_256 +#define ARCH_NOOP_1024 ARCH_NOOP_512 ARCH_NOOP_512 + +// 1023 = 512 + 256 + 128 + 64 + 32 + 16 + 8 + 4 + 2 + 1 +#define ARCH_NOOP_1023 \ + ARCH_NOOP_512 ARCH_NOOP_256 ARCH_NOOP_128 ARCH_NOOP_64 ARCH_NOOP_32 \ + ARCH_NOOP_16 ARCH_NOOP_8 ARCH_NOOP_4 ARCH_NOOP_2 ARCH_NOOP + +// Template function that executes 1024 NOPs and returns +__attribute__((noinline)) void nop_function_1024() { + ARCH_NOOP_1024 +} + +// Marker function to calculate size +__attribute__((noinline)) void nop_function_end() {} + +// Function that processes the buffer of function pointers +void process_buffer( + void (**buffer)(), + int size, + int iterations, + PerfCounters* perf) { + // Warm-up: traverse 10 times and call functions + for (int iter = 0; iter < 10; iter++) { + for (int i = 0; i < size; i++) { + buffer[i](); + } + } + + // Start measuring + perf_counters_enable(perf); + + // Main measurement loop - call function pointers + for (int iter = 0; iter < iterations; iter++) { + for (int i = 0; i < size; i++) { + buffer[i](); + } + } + + // Stop measuring + perf_counters_disable_and_read(perf); +} + +int main() { + int sizes[] = {128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, + 832, 
896, 960, 1024, 1536, 2048, 2560, 3072, 3584, 4096}; + int iterations[] = {10000, 10000, 10000, 10000, 10000, 10000, 10000, + 10000, 10000, 10000, 10000, 10000, 10000, 10000, + 10000, 10000, 10000, 10000, 10000, 10000, 10000}; + int num_sizes = sizeof(sizes) / sizeof(sizes[0]); + const size_t PAGE_SIZE = 64 * 1024; // 64KB pages + const size_t ALIGNMENT = 16; // Function alignment + + // Calculate function size + size_t function_size = (char*)nop_function_end - (char*)nop_function_1024; + printf("Function size: %zu bytes\n", function_size); + + if (function_size > PAGE_SIZE) { + fprintf(stderr, "Error: Function size exceeds page size\n"); + return 1; + } + + // Initialize perf counters + PerfCounters perf; + if (perf_counters_init(&perf, COUNTER_SET_BRANCH, 0) != 0) { + fprintf(stderr, "Failed to initialize performance counters\n"); + return 1; + } + + // Seed random number generator + srand(time(NULL)); + + printf("BTB Size Estimation (Function Calls)\n"); + printf("=====================================\n\n"); + printf( + "%-10s %-10s %-15s %-15s %-15s %-15s\n", + "Size", + "Pages", + "Instructions", + "Branch Misses", + "Misses/Iter", + "Misses/Buffer Size"); + printf( + "------------------------------------------------------------------------------------------------------------\n"); + + for (int i = 0; i < num_sizes; i++) { + int size = sizes[i]; + + // Calculate number of pages needed (one function per page) + int num_pages = size; + + // Allocate array to hold memory regions + void** pages = malloc(num_pages * sizeof(void*)); + if (!pages) { + fprintf(stderr, "Failed to allocate pages array\n"); + perf_counters_cleanup(&perf); + return 1; + } + + // Allocate function pointer buffer + void (**buffer)() = malloc(size * sizeof(void (*)())); + if (!buffer) { + fprintf(stderr, "Failed to allocate buffer\n"); + free(pages); + perf_counters_cleanup(&perf); + return 1; + } + + // Allocate executable memory pages and copy functions + for (int j = 0; j < num_pages; j++) { + // Allocate executable page + pages[j] = mmap( + NULL, + PAGE_SIZE, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, + -1, + 0); + + if (pages[j] == MAP_FAILED) { + fprintf( + stderr, + "Failed to allocate executable page %d for size %d\n", + j, + size); + // Cleanup previously allocated pages + for (int k = 0; k < j; k++) { + munmap(pages[k], PAGE_SIZE); + } + free(buffer); + free(pages); + perf_counters_cleanup(&perf); + return 1; + } + + // Calculate random offset within page (aligned) + size_t max_offset = PAGE_SIZE - function_size; + size_t offset = (rand() % (max_offset / ALIGNMENT)) * ALIGNMENT; + + // Copy function to this offset + void* dest = (char*)pages[j] + offset; + memcpy(dest, (void*)nop_function_1024, function_size); + + // Store function pointer in buffer + buffer[j] = (void (*)())dest; + } + + // Process buffer and measure + process_buffer(buffer, size, iterations[i], &perf); + + double misses_per_iteration = (double)perf.count_extra / iterations[i]; + double misses_per_buffer_size = + (double)perf.count_extra / iterations[i] / size; + + printf( + "%-10d %-10d %-15lld %-15lld %-15.2f %-15.6f\n", + size, + num_pages, + perf.count_instructions, + perf.count_extra, + misses_per_iteration, + misses_per_buffer_size); + + // Cleanup + for (int j = 0; j < num_pages; j++) { + munmap(pages[j], PAGE_SIZE); + } + free(buffer); + free(pages); + } + + perf_counters_cleanup(&perf); + + return 0; +} diff --git a/uarch_bench/fe_study_cuda.cu b/uarch_bench/fe_study_cuda.cu new file mode 100644 index 
00000000..0516967c
--- /dev/null
+++ b/uarch_bench/fe_study_cuda.cu
@@ -0,0 +1,316 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include <cassert>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+#include <vector>
+
+extern "C" {
+#include "utils.h"
+}
+
+__global__ void emptyKernel(const int n, float* __restrict__ gOutput) {}
+
+// Macros for unrolled NOP instructions (similar to frontend_study.c)
+//
+// AArch64 Instruction Encoding:
+// All AArch64 instructions are fixed-length 32-bit (4 bytes) encodings.
+// Reference: ARM Architecture Reference Manual ARMv8, Section C3.1 "A64
+// Instruction Set Encoding"
+// https://developer.arm.com/documentation/ddi0487/latest (ARM ARM for ARMv8-A)
+//
+// Calculation: 2MB = 2,097,152 bytes / 4 bytes per instruction = 524,288
+// instructions
+#define NOOP __asm__ __volatile__("nop\n\t");
+#define NOOP_4 NOOP NOOP NOOP NOOP
+#define NOOP_16 NOOP_4 NOOP_4 NOOP_4 NOOP_4
+#define NOOP_64 NOOP_16 NOOP_16 NOOP_16 NOOP_16
+#define NOOP_256 NOOP_64 NOOP_64 NOOP_64 NOOP_64
+#define NOOP_1024 NOOP_256 NOOP_256 NOOP_256 NOOP_256
+#define NOOP_4096 NOOP_1024 NOOP_1024 NOOP_1024 NOOP_1024
+#define NOOP_16384 NOOP_4096 NOOP_4096 NOOP_4096 NOOP_4096
+#define NOOP_65536 NOOP_16384 NOOP_16384 NOOP_16384 NOOP_16384
+#define NOOP_262144 NOOP_65536 NOOP_65536 NOOP_65536 NOOP_65536
+#define NOOP_524288 NOOP_262144 NOOP_262144
+
+// Function to flush instruction cache using NOP instructions
+__attribute__((noinline)) void flush_instruction_l1_cache_noop() {
+#if defined(__aarch64__)
+  // ARM/AArch64: 16,384 NOPs (64KB)
+  NOOP_16384
+#elif defined(__x86_64__) || defined(__i386__)
+  // x86: 24K noops is 24KB
+  NOOP_16384
+  NOOP_4096
+  NOOP_4096
+#else
+  // Fallback: default to 16,384 NOPs
+  NOOP_16384
+#endif
+}
+
+__attribute__((noinline)) void flush_for_flush_instruction_l1_cache_noop() {
+#if defined(__aarch64__)
+  // ARM/AArch64: 16,384 NOPs (64KB)
+  NOOP_16384
+#elif defined(__x86_64__) || defined(__i386__)
+  // x86: 8,192 NOPs (8KB)
+  NOOP_4096
+  NOOP_4096
+#else
+  // Fallback: default to 16,384 NOPs
+  NOOP_16384
+#endif
+}
+
+// Structure to hold profiled flush overhead metrics
+struct FlushOverhead {
+  double cycles_per_call;
+  double instructions_per_call;
+  double l1i_misses_per_call;
+};
+
+// Profile the flush_instruction_cache function to measure its overhead
+FlushOverhead profile_flush_overhead(CounterSet counter_set) {
+  FlushOverhead overhead = {0.0, 0.0, 0.0};
+
+  // Initialize perf counters for the specified counter set
+  PerfCounters perf;
+  int perf_available = (perf_counters_init(&perf, counter_set, 0) == 0);
+
+  if (!perf_available) {
+    printf("Warning: Performance counters not available for flush profiling\n");
+    return overhead;
+  }
+
+  // Profile provided flush function
+  printf("Profiling flush function overhead ...\n");
+
+  // Flush the L1I cache using a different function
+  flush_for_flush_instruction_l1_cache_noop();
+  perf_counters_enable(&perf);
+  flush_instruction_l1_cache_noop();
+  perf_counters_disable_and_read(&perf);
+
+  // Calculate per-call overhead
+  overhead.cycles_per_call = (double)perf.count_cycles;
+  overhead.instructions_per_call = (double)perf.count_instructions;
+  overhead.l1i_misses_per_call = (double)perf.count_extra;
+
+  printf(
+      "Flush overhead per call: Cycles: %.2f, Instructions: %.2f, Counter: %.2f\n",
+      overhead.cycles_per_call,
+      overhead.instructions_per_call,
+      overhead.l1i_misses_per_call);
+
+  perf_counters_cleanup(&perf);
+  return overhead;
+}
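+
+// timeLaunch: for each grid size, measure host-side perf counters around
+// numReps launches of emptyKernel, flushing the L1I before every launch so
+// the launch path starts cold. Two views are reported per grid size:
+//   Repeat-Run: averages over numReps launches, with the profiled flush
+//   overhead subtracted from the instruction/extra counts.
+//   Single-Run: one launch after three back-to-back flushes, giving the
+//   first-launch cost without averaging.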
+std::vector<float> timeLaunch(
+    const int numReps,
+    cudaStream_t stream,
+    const std::vector<int>& gridSizes,
+    const FlushOverhead& flush_overhead,
+    CounterSet counter_set) {
+  std::vector<float> timeUs;
+
+  // Initialize perf counters for the specified counter set
+  PerfCounters perf;
+  int perf_available = (perf_counters_init(&perf, counter_set, 0) == 0);
+  if (!perf_available) {
+    printf(
+        "Warning: Performance counters not available for launch profiling\n");
+    return timeUs;
+  }
+
+  for (const auto& numBlocks : gridSizes) {
+    dim3 block(256);
+    dim3 grid(numBlocks);
+    cudaDeviceSynchronize();
+
+    // Reset and enable perf counters before measurement
+    perf_counters_enable(&perf);
+
+    auto start = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < numReps; ++i) {
+      flush_instruction_l1_cache_noop();
+      emptyKernel<<<grid, block, 0, stream>>>(numBlocks, nullptr);
+    }
+
+    // Disable and read perf counters after measurement
+    perf_counters_disable_and_read(&perf);
+    cudaDeviceSynchronize();
+
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<float, std::micro> elapsed = end - start;
+    timeUs.push_back(elapsed.count() / numReps);
+
+    // Calculate per-launch metrics (for emptyKernel only)
+    double instructions_per_launch = perf.count_instructions / (double)numReps -
+        (double)flush_overhead.instructions_per_call;
+    double counter_per_launch = (double)perf.count_extra / (double)numReps -
+        (double)flush_overhead.l1i_misses_per_call;
+    double cycles_per_launch = (double)perf.count_cycles / (double)numReps;
+    double ipc = (perf.count_cycles > 0)
+        ? ((double)perf.count_instructions / (double)perf.count_cycles)
+        : 0.0;
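+
+    // Single-run re-measurement: flush the L1I three times so the launch
+    // path starts cold, then profile exactly one launch (no division by
+    // numReps, no flush inside the measured window).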
+    flush_instruction_l1_cache_noop();
+    flush_instruction_l1_cache_noop();
+    flush_instruction_l1_cache_noop();
+    perf_counters_enable(&perf);
+    emptyKernel<<<grid, block, 0, stream>>>(numBlocks, nullptr);
+    perf_counters_disable_and_read(&perf);
+
+    double instructions_per_launch_single = (double)perf.count_instructions;
+    double counter_per_launch_single = (double)perf.count_extra;
+    double cycles_per_launch_single = (double)perf.count_cycles;
+    double ipc_single = (perf.count_cycles > 0)
+        ? ((double)perf.count_instructions / (double)perf.count_cycles)
+        : 0.0;
+
+    printf(
+        "  Grid %6d: Repeat-Run: Instructions: %.2f, Counter: %.2f, Cycles (including overhead): %.2f, IPC: %.4f\n",
+        numBlocks,
+        instructions_per_launch,
+        counter_per_launch,
+        cycles_per_launch,
+        ipc);
+
+    printf(
+        "  Grid %6d: Single-Run: Instructions: %.2f, Counter: %.2f, Cycles (excluding overhead): %.2f, IPC: %.4f\n",
+        numBlocks,
+        instructions_per_launch_single,
+        counter_per_launch_single,
+        cycles_per_launch_single,
+        ipc_single);
+  }
+
+  // Cleanup perf counters
+  if (perf_available) {
+    perf_counters_cleanup(&perf);
+  }
+
+  return timeUs;
+}
+
+int main(int argc, char* argv[]) {
+  int gpuIdx = 0;
+  int numReps = 10000;
+  int minBlocks = 1;
+  int maxBlocks = 128 * 1024;
+  bool warmUpLaunch = false;
+  int verbosityLevel = 1;
+  CounterSet counter_set = COUNTER_SET_L1I; // Default to L1I cache misses
+
+  if (argc >= 2) {
+    gpuIdx = atoi(argv[1]);
+  }
+  if (argc >= 3) {
+    numReps = atoi(argv[2]);
+  }
+  if (argc >= 5) {
+    minBlocks = atoi(argv[3]);
+    maxBlocks = atoi(argv[4]);
+  }
+  if (argc >= 6) {
+    warmUpLaunch = (atoi(argv[5]) == 1);
+  }
+  if (argc >= 7) {
+    // Parse counter set argument
+    const char* counter_arg = argv[6];
+    if (strcmp(counter_arg, "l1i") == 0 || strcmp(counter_arg, "L1I") == 0) {
+      counter_set = COUNTER_SET_L1I;
+      printf("Using L1 Instruction cache misses counter\n");
+    } else if (
+        strcmp(counter_arg, "itlb") == 0 || strcmp(counter_arg, "ITLB") == 0) {
+      counter_set = COUNTER_SET_ITLB;
+      printf("Using iTLB misses counter\n");
+    } else if (
+        strcmp(counter_arg, "l2") == 0 || strcmp(counter_arg, "L2") == 0) {
+      counter_set = COUNTER_SET_L2;
+      printf("Using L2 load instructions counter\n");
+    } else {
+      fprintf(
+          stderr,
+          "Error: Invalid counter type '%s'. Valid options: l1i, itlb, l2\n",
+          counter_arg);
+      fprintf(
+          stderr,
+          "Usage: %s [gpuIdx] [numReps] [minBlocks] [maxBlocks] [warmUpLaunch] [counter_type]\n",
+          argv[0]);
+      fprintf(
+          stderr,
+          "  counter_type: l1i (L1I cache misses), itlb (iTLB misses), l2 (L2 load instructions)\n");
+      return 1;
+    }
+  }
+
+  std::vector<int> gridSizes;
+  for (int i = minBlocks; i <= maxBlocks; i *= 2) {
+    gridSizes.push_back(i);
+  }
+
+  cudaSetDevice(gpuIdx);
+  cudaFree(0);
+
+  cudaStream_t stream = 0;
+  cudaStreamCreate(&stream);
+
+  // Warm-up launch, to get the code to the GPU.
+  if (warmUpLaunch) {
+    emptyKernel<<<1, 1>>>(0, nullptr);
+  }
+
+  // Profile selected flush function overhead
+  // Calculate the size of the flush function
+  void* flush_func_ptr = (void*)flush_instruction_l1_cache_noop;
+  void* flush_for_flush_func_ptr =
+      (void*)flush_for_flush_instruction_l1_cache_noop;
+  size_t flush_func_size =
+      (size_t)((char*)flush_for_flush_func_ptr - (char*)flush_func_ptr);
+  printf(
+      "Size of flush_instruction_l1_cache_noop function: %zu bytes\n",
+      flush_func_size);
+
+  FlushOverhead flush_overhead = profile_flush_overhead(counter_set);
+
+  // Measure launch latency for null stream.
+  printf("\n=== Measuring null stream launch latencies ===\n");
+  auto nullStreamLaunchLatencies =
+      timeLaunch(numReps, 0, gridSizes, flush_overhead, counter_set);
+
+  // Measure launch latency for non-null stream.
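+  // Note: stream 0 is the legacy default (null) stream with implicit
+  // synchronization semantics; the explicitly created stream exercises the
+  // regular asynchronous launch path, so the two rows bracket typical
+  // launch behavior.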
+ printf("\n=== Measuring non-null stream launch latencies ===\n"); + auto nonNullStreamLaunchLatencies = + timeLaunch(numReps, stream, gridSizes, flush_overhead, counter_set); + + assert( + nullStreamLaunchLatencies.size() == nonNullStreamLaunchLatencies.size()); + assert(nullStreamLaunchLatencies.size() == gridSizes.size()); + + if (verbosityLevel > 0) { + std::cout << gpuIdx << " " << numReps << " " << minBlocks << " " + << maxBlocks << " " << warmUpLaunch << std::endl; + printf(" CTAs null non-null\n"); + printf("----------------------\n"); + } + for (long unsigned int i = 0; i < gridSizes.size(); ++i) { + printf( + "%6d %6.2f %6.2f\n", + gridSizes[i], + nullStreamLaunchLatencies[i], + nonNullStreamLaunchLatencies[i]); + // std::cout << gridSizes[i] << " " << nullStreamLaunchLatencies[i] << " " + // << nonNullStreamLaunchLatencies[i] << std::endl; + } + + cudaStreamDestroy(stream); + + return 0; +} diff --git a/uarch_bench/frontend_study.c b/uarch_bench/frontend_study.c new file mode 100644 index 00000000..602f06fa --- /dev/null +++ b/uarch_bench/frontend_study.c @@ -0,0 +1,418 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "utils.h" + +#define MISS1 \ + funcIndex = iter % divisor; \ + iter++; \ + functionPointers[funcIndex](); +#define MISS4 MISS1 MISS1 MISS1 MISS1 +#define MISS16 MISS4 MISS4 MISS4 MISS4 +#define MISS64 MISS16 MISS16 MISS16 MISS16 +#define MISS256 MISS64 MISS64 MISS64 MISS64 +#define MISS1024 MISS256 MISS256 MISS256 MISS256 + +#define RAND_MISS1 \ + funcIndex = my_rand() % divisor; \ + iter++; \ + functionPointers[funcIndex](); +#define RAND_MISS4 RAND_MISS1 RAND_MISS1 RAND_MISS1 RAND_MISS1 +#define RAND_MISS16 RAND_MISS4 RAND_MISS4 RAND_MISS4 RAND_MISS4 +#define RAND_MISS64 RAND_MISS16 RAND_MISS16 RAND_MISS16 RAND_MISS16 +#define RAND_MISS256 RAND_MISS64 RAND_MISS64 RAND_MISS64 RAND_MISS64 +#define RAND_MISS1024 RAND_MISS256 RAND_MISS256 RAND_MISS256 RAND_MISS256 + +// Architecture-specific NOPs - both use 4-byte instructions +#if defined(__aarch64__) +#define ARCH_NOOP __asm__ __volatile__("nop\n\t"); // ARM64 NOP (4 bytes) +#elif defined(__x86_64__) +// x86-64 4-byte NOP: 0F 1F 40 00 (NOP DWORD PTR [RAX+0]) +#define ARCH_NOOP __asm__ __volatile__(".byte 0x0F, 0x1F, 0x40, 0x00\n\t"); +#else +#error "Unsupported architecture" +#endif + +#define ARCH_NOOP_4 ARCH_NOOP ARCH_NOOP ARCH_NOOP ARCH_NOOP +#define ARCH_NOOP_16 ARCH_NOOP_4 ARCH_NOOP_4 ARCH_NOOP_4 ARCH_NOOP_4 +#define ARCH_NOOP_64 ARCH_NOOP_16 ARCH_NOOP_16 ARCH_NOOP_16 ARCH_NOOP_16 +#define ARCH_NOOP_256 ARCH_NOOP_64 ARCH_NOOP_64 ARCH_NOOP_64 ARCH_NOOP_64 +#define ARCH_NOOP_1024 ARCH_NOOP_256 ARCH_NOOP_256 ARCH_NOOP_256 ARCH_NOOP_256 +#define ARCH_NOOP_4096 \ + ARCH_NOOP_1024 ARCH_NOOP_1024 ARCH_NOOP_1024 ARCH_NOOP_1024 +#define ARCH_NOOP_8192 ARCH_NOOP_4096 ARCH_NOOP_4096 + +// Simple no-op functions of various sizes +__attribute__((noinline)) void targetFunction16() { + ARCH_NOOP_16 +} + +__attribute__((noinline)) void targetFunction64() { + ARCH_NOOP_64 +} + +__attribute__((noinline)) void targetFunction256() { + ARCH_NOOP_256 +} + +__attribute__((noinline)) void targetFunction1024() { + ARCH_NOOP_1024 +} + +__attribute__((noinline)) void targetFunction4096() { + ARCH_NOOP_4096 +} + +__attribute__((noinline)) void targetFunction8192() { + ARCH_NOOP_8192 +} + +// Marker function for size estimation +__attribute__((noinline)) void targetFunctionEnd() {} + +// Run the main measurement loop 
+
+// Run the main measurement loop
+void run_measurement_loop(
+    void (**functionPointers)(void),
+    int divisor,
+    unsigned long long iterations,
+    int use_random_jumps) {
+  size_t funcIndex = 0;
+  unsigned long long iter = 0;
+
+  if (use_random_jumps) {
+    while (iter < iterations) {
+      RAND_MISS1024
+    }
+  } else {
+    while (iter < iterations) {
+      MISS1024
+    }
+  }
+}
+
+// Print usage information
+void print_usage(const char* program_name) {
+  fprintf(
+      stderr,
+      "Usage: %s -d <divisor> -i <iterations> -b <buffer_size_mb> -n <num_buffers> -s <page_size_kb> -f <func_nops> -r <use_random_jumps>\n",
+      program_name);
+}
+
+int main(int argc, char* argv[]) {
+  // Initialize variables with invalid values to detect missing arguments
+  int divisor = -1;
+  unsigned long long iterations = 0;
+  int buffer_size_mb = -1;
+  int num_buffers = -1;
+  int page_kb = -1;
+  int use_random_jumps = -1;
+  int func_nops = -1;
+  const unsigned long code_alignment = 16;
+
+  // Parse command line arguments
+  int opt;
+  while ((opt = getopt(argc, argv, "d:i:b:n:s:f:r:h")) != -1) {
+    switch (opt) {
+      case 'd':
+        divisor = atoi(optarg);
+        break;
+      case 'i':
+        iterations = atoll(optarg);
+        break;
+      case 'b':
+        buffer_size_mb = atoi(optarg);
+        break;
+      case 'n':
+        num_buffers = atoi(optarg);
+        break;
+      case 's':
+        page_kb = atoi(optarg);
+        break;
+      case 'f':
+        func_nops = atoi(optarg);
+        break;
+      case 'r':
+        use_random_jumps = atoi(optarg);
+        break;
+      case 'h':
+        print_usage(argv[0]);
+        return 0;
+      default:
+        print_usage(argv[0]);
+        return 1;
+    }
+  }
+
+  // Validate all required arguments are present
+  if (divisor == -1 || iterations == 0 || buffer_size_mb == -1 ||
+      num_buffers == -1 || page_kb == -1 || use_random_jumps == -1 ||
+      func_nops == -1) {
+    fprintf(stderr, "Error: All arguments are required.\n\n");
+    print_usage(argv[0]);
+    return 1;
+  }
+
+  // Validate argument values
+  if (divisor <= 0) {
+    fprintf(stderr, "Error: Divisor must be a positive integer\n");
+    return 1;
+  }
+
+  if (iterations <= 0) {
+    fprintf(stderr, "Error: Iterations must be a positive integer\n");
+    return 1;
+  }
+
+  if (buffer_size_mb <= 0) {
+    fprintf(stderr, "Error: Buffer size (MB) must be a positive integer\n");
+    return 1;
+  }
+
+  if (num_buffers <= 0) {
+    fprintf(stderr, "Error: Number of buffers must be a positive integer\n");
+    return 1;
+  }
+
+  if (page_kb <= 0) {
+    fprintf(stderr, "Error: Page (KB) must be a positive integer\n");
+    return 1;
+  }
+
+  if (func_nops != 16 && func_nops != 64 && func_nops != 256 &&
+      func_nops != 1024 && func_nops != 4096 && func_nops != 8192) {
+    fprintf(stderr, "Error: Function NOPs must be 16/64/256/1024/4096/8192\n");
+    return 1;
+  }
+
+  if (use_random_jumps != 0 && use_random_jumps != 1) {
+    fprintf(stderr, "Error: Random jumps must be 0 or 1\n");
+    return 1;
+  }
+
+  // Select function to use based on the requested NOP size
+  size_t functionSize = 0;
+  void (*targetFunction)(void) = ({
+    void (*fn)(void) = NULL;
+    switch (func_nops) {
+      case 16:
+        fn = targetFunction16;
+        functionSize = (char*)targetFunction64 - (char*)targetFunction16;
+        break;
+      case 64:
+        fn = targetFunction64;
+        functionSize = (char*)targetFunction256 - (char*)targetFunction64;
+        break;
+      case 256:
+        fn = targetFunction256;
+        functionSize = (char*)targetFunction1024 - (char*)targetFunction256;
+        break;
+      case 1024:
+        fn = targetFunction1024;
+        functionSize = (char*)targetFunction4096 - (char*)targetFunction1024;
+        break;
+      case 4096:
+        fn = targetFunction4096;
+        functionSize = (char*)targetFunction8192 - (char*)targetFunction4096;
+        break;
+      case 8192:
+        fn = targetFunction8192;
+        functionSize = (char*)targetFunctionEnd - (char*)targetFunction8192;
+        break;
default: + // Fallback to the smallest function if an invalid size is somehow + // passed + fn = targetFunction16; + functionSize = (char*)targetFunction64 - (char*)targetFunction16; + break; + } + fn; + }); + + // Calculate derived values + unsigned long page_bytes = (unsigned long long)page_kb * 1024; + unsigned long long num_copies = (unsigned long long)num_buffers * + (unsigned long long)buffer_size_mb * 1024 / (unsigned long long)page_kb; + int num_regions = num_buffers; + + printf("Using divisor: %d\n", divisor); + printf("Using iterations: %llu\n", iterations); + printf( + "Allocating %d regions of %d MB each (total %d MB)...\n", + num_regions, + buffer_size_mb, + num_buffers * buffer_size_mb); + printf("Page: %d KB\n", page_kb); + printf("Number of function copies: %llu\n", num_copies); + printf("Estimated function size: %zu bytes\n", functionSize); + + if (num_copies < (unsigned long long)divisor) { + fprintf(stderr, "Warning: setting 'divisor' to the num_copies value.\n"); + divisor = num_copies; + } + + // Allocate array to hold pointers to each region + void** regions = malloc(num_regions * sizeof(void*)); + if (!regions) { + fprintf(stderr, "Failed to allocate regions array\n"); + return 1; + } + + // Map each region separately + const unsigned long long REGION_SIZE = + (unsigned long long)buffer_size_mb * 1024 * 1024; + printf("Mapping %d memory regions...\n", num_regions); + for (int i = 0; i < num_regions; i++) { + regions[i] = mmap( + NULL, + REGION_SIZE, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, + -1, + 0); + + if (regions[i] == MAP_FAILED) { + fprintf(stderr, "Failed to allocate memory region %d\n", i); + // Cleanup already allocated regions + for (int j = 0; j < i; j++) { + munmap(regions[j], REGION_SIZE); + } + free(regions); + return 1; + } + } + printf("Successfully mapped %d regions\n", num_regions); + + // Allocate array to store function pointers + void (**functionPointers)(void) = malloc(num_copies * sizeof(void (*)(void))); + if (!functionPointers) { + fprintf(stderr, "Failed to allocate function pointer array\n"); + for (int i = 0; i < num_regions; i++) { + munmap(regions[i], REGION_SIZE); + } + free(regions); + return 1; + } + + // Copy function code to regions with page + printf("Copying function code...\n"); + for (unsigned long long i = 0; i < num_copies; i++) { + unsigned long intra_page_offset = my_rand() % + ((page_bytes - functionSize - 1) / code_alignment) * code_alignment; + + // Calculate total byte offset + unsigned long long total_offset = (i * page_bytes) + intra_page_offset; + + // Determine which region and offset within that region + int region_idx = total_offset / REGION_SIZE; + unsigned long long region_offset = total_offset % REGION_SIZE; + + // Calculate destination address + void* dest = (char*)regions[region_idx] + region_offset; + + memcpy(dest, (void*)targetFunction, functionSize); + functionPointers[i] = (void (*)(void))dest; + } + printf("Function copies created: %llu\n", num_copies); + + // Get CPU frequency (just once) + unsigned long long cpu_freq_hz = get_cpu_frequency_hz(); + printf("CPU Frequency: %.2f GHz\n\n", cpu_freq_hz / 1e9); + + // Structure to hold all measurement results + MeasurementResults results; + memset(&results, 0, sizeof(MeasurementResults)); + + // ========== Measurement 0: Timing only (no profiling) ========== + printf("=== Starting Measurement 0: Timing only (no profiling) ===\n"); + struct timespec start_time, end_time; + clock_gettime(CLOCK_MONOTONIC, &start_time); + 
run_measurement_loop(functionPointers, divisor, iterations, use_random_jumps); + clock_gettime(CLOCK_MONOTONIC, &end_time); + + // Calculate elapsed time in seconds + double elapsed_sec = (end_time.tv_sec - start_time.tv_sec) + + (end_time.tv_nsec - start_time.tv_nsec) / 1e9; + double iterations_per_sec = iterations / elapsed_sec; + + printf("Elapsed time: %.6f seconds\n", elapsed_sec); + printf("Iterations per second: %.2f M/s\n", iterations_per_sec / 1e6); + printf("Completed timing measurement\n\n"); + + // ========== Measurement 1: iTLB misses ========== + printf("=== Starting Measurement 1: iTLB misses ===\n"); + PerfCounters perf_itlb; + if (perf_counters_init(&perf_itlb, COUNTER_SET_ITLB, 0) == 0) { + perf_counters_enable(&perf_itlb); + run_measurement_loop( + functionPointers, divisor, iterations, use_random_jumps); + perf_counters_disable_and_read(&perf_itlb); + results.cycles_itlb = perf_itlb.count_cycles; + results.instructions_itlb = perf_itlb.count_instructions; + results.itlb_misses = perf_itlb.count_extra; + perf_counters_cleanup(&perf_itlb); + printf("Completed iTLB measurement\n\n"); + } + + // ========== Measurement 2: L1I cache misses ========== + printf("=== Starting Measurement 2: L1I cache misses ===\n"); + PerfCounters perf_l1i; + if (perf_counters_init(&perf_l1i, COUNTER_SET_L1I, 0) == 0) { + perf_counters_enable(&perf_l1i); + run_measurement_loop( + functionPointers, divisor, iterations, use_random_jumps); + perf_counters_disable_and_read(&perf_l1i); + results.cycles_l1i = perf_l1i.count_cycles; + results.instructions_l1i = perf_l1i.count_instructions; + results.l1i_misses = perf_l1i.count_extra; + perf_counters_cleanup(&perf_l1i); + printf("Completed L1I measurement\n\n"); + } + + // ========== Measurement 3: Branch misses ========== + printf("=== Starting Measurement 3: Branch misses ===\n"); + PerfCounters perf_branch; + if (perf_counters_init(&perf_branch, COUNTER_SET_BRANCH, 0) == 0) { + perf_counters_enable(&perf_branch); + run_measurement_loop( + functionPointers, divisor, iterations, use_random_jumps); + perf_counters_disable_and_read(&perf_branch); + results.cycles_branch = perf_branch.count_cycles; + results.instructions_branch = perf_branch.count_instructions; + results.branch_misses = perf_branch.count_extra; + perf_counters_cleanup(&perf_branch); + printf("Completed Branch measurement\n\n"); + } + + // ========== Measurement 4: L2 cache loads ========== + printf("=== Starting Measurement 4: L2 cache loads ===\n"); + PerfCounters perf_l2; + if (perf_counters_init(&perf_l2, COUNTER_SET_L2, 0) == 0) { + perf_counters_enable(&perf_l2); + run_measurement_loop( + functionPointers, divisor, iterations, use_random_jumps); + perf_counters_disable_and_read(&perf_l2); + results.cycles_l2 = perf_l2.count_cycles; + results.instructions_l2 = perf_l2.count_instructions; + results.l2_loads = perf_l2.count_extra; + perf_counters_cleanup(&perf_l2); + printf("Completed L2 measurement\n\n"); + } + + // Print all results + print_measurement_results(&results, iterations); + + // Cleanup + free(functionPointers); + for (int i = 0; i < num_regions; i++) { + munmap(regions[i], REGION_SIZE); + } + free(regions); + + return 0; +} diff --git a/uarch_bench/full_run_input.txt b/uarch_bench/full_run_input.txt new file mode 100644 index 00000000..8a518970 --- /dev/null +++ b/uarch_bench/full_run_input.txt @@ -0,0 +1,41 @@ +10000000,100000000,32,256,64,16,0 +10000000,100000000,32,256,64,64,0 +10000000,100000000,32,256,64,256,0 +10000000,100000000,32,256,64,1024,0 
+10000000,10000000,32,256,64,4096,0
+10000000,10000000,32,256,64,8192,0
+10000000,100000000,16,128,64,16,0
+10000000,100000000,16,128,64,64,0
+10000000,100000000,16,128,64,256,0
+10000000,100000000,16,128,64,1024,0
+10000000,10000000,16,128,64,4096,0
+10000000,10000000,16,128,64,8192,0
+10000000,100000000,512,64,64,16,0
+10000000,100000000,512,64,64,64,0
+10000000,100000000,512,64,64,256,0
+10000000,100000000,512,64,64,1024,0
+10000000,10000000,512,64,64,4096,0
+10000000,10000000,512,64,64,8192,0
+10000000,10000000,16,64,64,16,0
+10000000,10000000,32,64,64,16,0
+10000000,10000000,64,64,64,16,0
+10000000,10000000,128,64,64,16,0
+10000000,10000000,256,64,64,16,0
+10000000,10000000,512,64,64,16,0
+10000000,10000000,1,20,1024,16,0
+10000000,10000000,1,25,1024,16,0
+10000000,10000000,1,30,1024,16,0
+10000000,10000000,1,32,1024,16,0
+10000000,10000000,1,36,1024,16,0
+10000000,10000000,1,40,1024,16,0
+10000000,10000000,1,42,1024,16,0
+10000000,10000000,1,44,1024,16,0
+10000000,10000000,1,48,1024,16,0
+10000000,10000000,1,64,1024,16,0
+10000000,10000000,1,128,1024,16,0
+10000000,10000000,1,256,1024,16,0
+10000000,10000000,1,512,1024,16,0
+10000000,10000000,1,1024,1024,16,0
+10000000,10000000,1,2048,1024,16,0
+10000000,10000000,1,4096,1024,16,0
+10000000,10000000,1,8192,1024,16,0
diff --git a/uarch_bench/instr_throughput.c b/uarch_bench/instr_throughput.c
new file mode 100644
index 00000000..3f4f66f3
--- /dev/null
+++ b/uarch_bench/instr_throughput.c
@@ -0,0 +1,420 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <unistd.h>
+#include "utils.h"
+
+// Architecture-specific code generation
+#if defined(__aarch64__)
+#define NOP_INSTRUCTION 0xD503201F // ARM64 NOP instruction
+#define RET_INSTRUCTION 0xD65F03C0 // ARM64 RET instruction
+#define INSTRUCTION_SIZE 4
+#elif defined(__x86_64__)
+// x86-64 4-byte NOP: 0F 1F 40 00 (NOP DWORD PTR [RAX+0])
+// Little-endian representation: bytes 0x0F, 0x1F, 0x40, 0x00 -> uint32_t
+// 0x00401F0F
+#define NOP_INSTRUCTION 0x00401F0F // x86-64 4-byte NOP instruction
+#define RET_INSTRUCTION 0xC3 // x86-64 RET instruction (1 byte)
+#define INSTRUCTION_SIZE 4
+#else
+#error "Unsupported architecture"
+#endif
+
+// Structure to hold an executable code buffer
+typedef struct {
+  void* buffer;
+  size_t size;
+  void (*func)(void);
+} CodeBuffer;
+
+// Structure to hold results for a single test
+typedef struct {
+  const char* size_name;
+  unsigned long size_kb;
+  unsigned long num_nops;
+  CodeBuffer* code;
+  long long cycles;
+  long long instructions;
+  long long l1i_misses;
+  long long itlb_misses;
+  long long l2_loads;
+  long long dram_reads;
+  double bytes_per_cycle;
+} TestResult;
+
+// Allocate executable memory buffer
+CodeBuffer* create_code_buffer(size_t size_kb) {
+  CodeBuffer* code = (CodeBuffer*)malloc(sizeof(CodeBuffer));
+  if (!code) {
+    fprintf(stderr, "Failed to allocate CodeBuffer structure\n");
+    return NULL;
+  }
+
+  size_t size_bytes = size_kb * 1024;
+  code->size = size_bytes;
+
+  // Allocate executable memory using mmap
+  code->buffer = mmap(
+      NULL,
+      size_bytes,
+      PROT_READ | PROT_WRITE | PROT_EXEC,
+      MAP_PRIVATE | MAP_ANONYMOUS,
+      -1,
+      0);
+
+  if (code->buffer == MAP_FAILED) {
+    fprintf(
+        stderr,
+        "Failed to allocate executable memory of size %zu KB\n",
+        size_kb);
+    free(code);
+    return NULL;
+  }
+
+  code->func = (void (*)(void))code->buffer;
+  return code;
+}
+
+// Free code buffer
+void free_code_buffer(CodeBuffer* code) {
+  if
(code) { + if (code->buffer != MAP_FAILED) { + munmap(code->buffer, code->size); + } + free(code); + } +} + +// Generate executable code in buffer +// Returns the number of NOP instructions generated +unsigned long generate_code(CodeBuffer* code) { + if (!code || !code->buffer) { + return 0; + } + +#if defined(__aarch64__) + // ARM64: Generate code + uint32_t* ptr = (uint32_t*)code->buffer; + size_t num_instructions = code->size / sizeof(uint32_t); + + // Fill buffer with NOPs + for (size_t i = 0; i < num_instructions - 1; i++) { + ptr[i] = NOP_INSTRUCTION; + } + + // Add RET at the end + ptr[num_instructions - 1] = RET_INSTRUCTION; + + // Flush instruction cache for ARM + __builtin___clear_cache( + (char*)code->buffer, (char*)code->buffer + code->size); + + return num_instructions - 1; // All instructions except RET are NOPs + +#elif defined(__x86_64__) + // x86-64: Generate code with 4-byte NOPs + uint32_t* ptr = (uint32_t*)code->buffer; + size_t num_instructions = code->size / sizeof(uint32_t); + + // Fill buffer with 4-byte NOPs + for (size_t i = 0; i < num_instructions - 1; i++) { + ptr[i] = NOP_INSTRUCTION; + } + + // Add RET at the end (overwrite last NOP with RET in first byte) + uint8_t* ret_ptr = (uint8_t*)&ptr[num_instructions - 1]; + ret_ptr[0] = RET_INSTRUCTION; + + // x86 typically has coherent I-cache, but flush anyway + __builtin___clear_cache( + (char*)code->buffer, (char*)code->buffer + code->size); + + return num_instructions - 1; // All instructions except last are NOPs + +#endif +} + +// Run a single test with all counter sets +void run_test(TestResult* result, unsigned long iterations) { + PerfCounters perf; + + if (!result->code || !result->code->func) { + fprintf(stderr, "Invalid code buffer for test %s\n", result->size_name); + return; + } + + // Measure with L1I counter set + if (perf_counters_init(&perf, COUNTER_SET_L1I, 0) == 0) { + perf_counters_enable(&perf); + for (unsigned long i = 0; i < iterations; i++) { + result->code->func(); + } + perf_counters_disable_and_read(&perf); + result->cycles = perf.count_cycles; + result->instructions = perf.count_instructions; + result->l1i_misses = perf.count_extra; + perf_counters_cleanup(&perf); + } + + // Measure with iTLB counter set + if (perf_counters_init(&perf, COUNTER_SET_ITLB, 0) == 0) { + perf_counters_enable(&perf); + for (unsigned long i = 0; i < iterations; i++) { + result->code->func(); + } + perf_counters_disable_and_read(&perf); + result->itlb_misses = perf.count_extra; + perf_counters_cleanup(&perf); + } + + // Measure with L2 counter set + if (perf_counters_init(&perf, COUNTER_SET_L2, 0) == 0) { + perf_counters_enable(&perf); + for (unsigned long i = 0; i < iterations; i++) { + result->code->func(); + } + perf_counters_disable_and_read(&perf); + result->l2_loads = perf.count_extra; + perf_counters_cleanup(&perf); + } + + // Measure with DRAM reads counter set + if (perf_counters_init(&perf, COUNTER_SET_DRAM_READS, 0) == 0) { + perf_counters_enable(&perf); + for (unsigned long i = 0; i < iterations; i++) { + result->code->func(); + } + perf_counters_disable_and_read(&perf); + result->dram_reads = perf.count_extra; + perf_counters_cleanup(&perf); + } + + // Calculate bytes per cycle (instructions * instruction_size / cycles) + if (result->cycles > 0) { + result->bytes_per_cycle = + ((double)result->instructions * INSTRUCTION_SIZE) / + (double)result->cycles; + } else { + result->bytes_per_cycle = 0.0; + } +} + +// Print results in a table format +void print_results_header() { + printf("\n"); + printf( + 
"====================================================================================================\n"); + printf( + "%-12s %10s %12s %12s %10s %12s %12s %12s %12s\n", + "Size", + "NOPs", + "Cycles", + "Instructions", + "Bytes/Cycle", + "L1I Misses", + "iTLB Misses", + "L2 Loads", + "DRAM Reads"); + printf( + "====================================================================================================\n"); +} + +void print_result(const TestResult* result, unsigned long iterations) { + // Calculate per-iteration metrics + double cycles_per_iter = (double)result->cycles / iterations; + double instructions_per_iter = (double)result->instructions / iterations; + double l1i_per_iter = (double)result->l1i_misses / iterations; + double itlb_per_iter = (double)result->itlb_misses / iterations; + double l2_per_iter = (double)result->l2_loads / iterations; + double dram_per_iter = (double)result->dram_reads / iterations; + + printf( + "%-12s %10lu %12.0f %12.0f %10.4f %12.2f %12.2f %12.2f %12.2f\n", + result->size_name, + result->num_nops, + cycles_per_iter, + instructions_per_iter, + result->bytes_per_cycle, + l1i_per_iter, + itlb_per_iter, + l2_per_iter, + dram_per_iter); +} + +void print_usage(const char* program_name) { + fprintf( + stderr, + "Usage: %s [-i ] [-s ]\n", + program_name); + fprintf( + stderr, + " -i : Number of iterations per test (default: auto-calculated, multiples of 10)\n"); + fprintf( + stderr, + " -s : Test only specific size in KB (1, 4, 8, 16, 32, 64, 128, 192, 256, 512, 1024, 4096, 16384, 32768, 65536, 196608, 262144, 524288, 1048576)\n"); + fprintf(stderr, " -h: Show this help message\n"); +} + +int main(int argc, char* argv[]) { + unsigned long iterations = 0; // 0 means auto-calculate + long specific_size_kb = -1; // -1 means run all sizes + + // Parse command-line arguments + int opt; + while ((opt = getopt(argc, argv, "i:s:h")) != -1) { + switch (opt) { + case 'i': + iterations = strtoul(optarg, NULL, 10); + break; + case 's': + specific_size_kb = strtol(optarg, NULL, 10); + break; + case 'h': + print_usage(argv[0]); + return 0; + default: + print_usage(argv[0]); + return 1; + } + } + + // Define test sizes in KB + unsigned long test_sizes_kb[] = { + 1, + 4, + 8, + 16, + 32, + 64, + 128, + 192, + 256, + 512, + 1024, + 4096, + 16384, + 32768, + 65536, + 196608, + 262144, + 524288, + 1048576}; + int num_sizes = sizeof(test_sizes_kb) / sizeof(test_sizes_kb[0]); + + printf("Dynamic Instruction Throughput Benchmark\n"); + printf("Architecture: "); +#if defined(__aarch64__) || defined(__arm__) + printf("ARM/AArch64\n"); + printf("Instruction size: 4 bytes (NOP)\n"); +#elif defined(__x86_64__) || defined(__i386__) + printf("x86/x86_64\n"); + printf("Instruction size: 4 bytes (multi-byte NOP)\n"); +#else + printf("Unknown\n"); +#endif + + unsigned long cpu_freq = get_cpu_frequency_hz(); + printf("CPU Frequency: %.2f GHz\n", (double)cpu_freq / 1e9); + + // Allocate and generate code buffers + printf("\nAllocating and generating code buffers...\n"); + TestResult* tests = (TestResult*)calloc(num_sizes, sizeof(TestResult)); + if (!tests) { + fprintf(stderr, "Failed to allocate test results array\n"); + return 1; + } + + int num_tests = 0; + for (int i = 0; i < num_sizes; i++) { + unsigned long size_kb = test_sizes_kb[i]; + + // Skip if specific size requested and this isn't it + if (specific_size_kb != -1 && size_kb != (unsigned long)specific_size_kb) { + continue; + } + + printf(" Creating %lu KB code buffer...\n", size_kb); + + CodeBuffer* code = 
create_code_buffer(size_kb); + if (!code) { + fprintf(stderr, "Failed to create %lu KB buffer, skipping\n", size_kb); + continue; + } + + unsigned long num_nops = generate_code(code); + + // Create size name + char* size_name = (char*)malloc(32); + if (size_kb >= 1024) { + snprintf(size_name, 32, "%luM", size_kb / 1024); + } else { + snprintf(size_name, 32, "%luK", size_kb); + } + + tests[num_tests].size_name = size_name; + tests[num_tests].size_kb = size_kb; + tests[num_tests].num_nops = num_nops; + tests[num_tests].code = code; + tests[num_tests].cycles = 0; + tests[num_tests].instructions = 0; + tests[num_tests].l1i_misses = 0; + tests[num_tests].itlb_misses = 0; + tests[num_tests].l2_loads = 0; + tests[num_tests].dram_reads = 0; + tests[num_tests].bytes_per_cycle = 0.0; + + num_tests++; + } + + printf("\nRunning tests...\n"); + print_results_header(); + + // Run tests + for (int i = 0; i < num_tests; i++) { + // Auto-calculate iterations if not specified + // Use fewer iterations for larger functions to keep runtime reasonable + // Ensure iterations are multiples of 10 + unsigned long test_iterations = iterations; + if (test_iterations == 0) { + if (tests[i].size_kb <= 64) { + test_iterations = 1000000; + } else if (tests[i].size_kb <= 512) { + test_iterations = 100000; + } else if (tests[i].size_kb <= 4096) { + test_iterations = 10000; + } else if (tests[i].size_kb <= 65536) { + test_iterations = 1000; + } else { + test_iterations = 100; + } + + // Ensure it's a multiple of 10 + test_iterations = (test_iterations / 10) * 10; + if (test_iterations == 0) { + test_iterations = 10; + } + } + + run_test(&tests[i], test_iterations); + print_result(&tests[i], test_iterations); + } + + printf( + "====================================================================================================\n"); + printf("\nNote: All metrics shown are per-iteration averages.\n"); + + // Cleanup + for (int i = 0; i < num_tests; i++) { + free_code_buffer(tests[i].code); + free((void*)tests[i].size_name); + } + free(tests); + + return 0; +} diff --git a/uarch_bench/run_benchmark.sh b/uarch_bench/run_benchmark.sh new file mode 100644 index 00000000..5012a4b9 --- /dev/null +++ b/uarch_bench/run_benchmark.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# Script to run frontend_study with various parameters and collect performance metrics +# Input: Text file with comma-separated values (one configuration per line) +# Output: CSV format with performance metrics + +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +INPUT_FILE="$1" + +if [ ! -f "$INPUT_FILE" ]; then + echo "Error: Input file '$INPUT_FILE' not found" + exit 1 +fi + +# Get the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +EXECUTABLE="$SCRIPT_DIR/frontend_study" + +if [ ! 
-x "$EXECUTABLE" ]; then + echo "Error: Executable '$EXECUTABLE' not found or not executable" + exit 1 +fi + +# Print CSV header +echo "buffer_size_mb,num_buffers,function_size_bytes,elapsed_time,iterations_per_sec,cycles,instructions,l1_cache_misses_per_iter,l1_tlb_misses_per_iter,branch_misses_per_iter" + +# Process each line in the input file +while IFS= read -r line; do + # Skip empty lines + [[ -z "$line" ]] && continue + + # Parse comma-separated values + IFS=',' read -r divisor iterations buffer_size num_buffers page_size func_sel random_jumps <<< "$line" + + # Trim whitespace + divisor=$(echo "$divisor" | xargs) + iterations=$(echo "$iterations" | xargs) + buffer_size=$(echo "$buffer_size" | xargs) + num_buffers=$(echo "$num_buffers" | xargs) + page_size=$(echo "$page_size" | xargs) + func_sel=$(echo "$func_sel" | xargs) + random_jumps=$(echo "$random_jumps" | xargs) + + # Run the executable with the parsed parameters + RUN_OUTPUT=$($EXECUTABLE -d $divisor -i $iterations -b $buffer_size -n $num_buffers -s $page_size -f $func_sel -r $random_jumps 2>&1) + + # Extract performance metrics from the output + FUNCTION_SIZE=$(echo "$RUN_OUTPUT" | grep -oP 'Estimated function size:\s+\K[0-9]+' || echo "0") + ELAPSED_TIME=$(echo "$RUN_OUTPUT" | grep -oP 'Elapsed time:\s+\K[0-9.]+' || echo "0") + ITER_PER_SEC=$(echo "$RUN_OUTPUT" | grep -oP 'Iterations per second:\s+\K[0-9.]+' || echo "0") + + # Get cycles and instructions + CYCLES=$(echo "$RUN_OUTPUT" | grep -oP 'Cycles:\s+\K[0-9]+' | tr -d ',' | head -n 1 || echo "0") + INSTRUCTIONS=$(echo "$RUN_OUTPUT" | grep -oP 'Instructions:\s+\K[0-9]+' | tr -d ',' | head -n 1 || echo "0") + + # Get L1 cache misses, L1 TLB misses, and branch misses + L1_CACHE_MISSES_PER_ITER=$(echo "$RUN_OUTPUT" | grep -oP 'L1I Misses / Iteration:\s+\K[0-9]+(?:\.[0-9]+)?' | tr -d ',' || echo "0") + L1_TLB_MISSES_PER_ITER=$(echo "$RUN_OUTPUT" | grep -oP 'iTLB Misses / Iteration:\s+\K[0-9]+(?:\.[0-9]+)?' | tr -d ',' || echo "0") + BRANCH_MISSES_PER_ITER=$(echo "$RUN_OUTPUT" | grep -oP 'Branch Misses / Iteration:\s+\K[0-9]+(?:\.[0-9]+)?' | tr -d ',' || echo "0") + + # Output CSV row + echo "$buffer_size,$num_buffers,$FUNCTION_SIZE,$ELAPSED_TIME,$ITER_PER_SEC,$CYCLES,$INSTRUCTIONS,$L1_CACHE_MISSES_PER_ITER,$L1_TLB_MISSES_PER_ITER,$BRANCH_MISSES_PER_ITER" + +done < "$INPUT_FILE" diff --git a/uarch_bench/utils.c b/uarch_bench/utils.c new file mode 100644 index 00000000..7275f9ab --- /dev/null +++ b/uarch_bench/utils.c @@ -0,0 +1,272 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
diff --git a/uarch_bench/utils.c b/uarch_bench/utils.c
new file mode 100644
index 00000000..7275f9ab
--- /dev/null
+++ b/uarch_bench/utils.c
@@ -0,0 +1,272 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#define _GNU_SOURCE
+#include "utils.h"
+#include <errno.h>
+#include <linux/perf_event.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+// Wrapper for perf_event_open syscall
+static long perf_event_open(
+    struct perf_event_attr* hw_event,
+    pid_t pid,
+    int cpu,
+    int group_fd,
+    unsigned long flags) {
+  return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
+}
+
+// Setup a perf counter (internal helper)
+static int setup_perf_counter(uint32_t type, uint64_t config, int group_fd) {
+  struct perf_event_attr pe;
+  memset(&pe, 0, sizeof(struct perf_event_attr));
+  pe.type = type;
+  pe.size = sizeof(struct perf_event_attr);
+  pe.config = config;
+  pe.disabled = 1; // Start disabled
+  pe.exclude_kernel = 1;
+  pe.exclude_hv = 1;
+
+  int fd = perf_event_open(&pe, 0, -1, group_fd, 0);
+  if (fd == -1) {
+    fprintf(
+        stderr,
+        "Error opening perf counter (type=%u, config=%llu): %s\n",
+        type,
+        (unsigned long long)config,
+        strerror(errno));
+  }
+  return fd;
+}
+
+// Initialize perf counters
+int perf_counters_init(
+    PerfCounters* perf,
+    CounterSet counter_set,
+    int verbose) {
+  perf->cpu_freq_hz = get_cpu_frequency_hz();
+  perf->counter_set = counter_set;
+
+  const char* counter_name = "";
+  uint64_t extra_config = 0;
+  uint32_t extra_type = PERF_TYPE_HARDWARE;
+
+  // Configure the extra counter based on counter set
+  switch (counter_set) {
+    case COUNTER_SET_ITLB:
+      counter_name = "iTLB-load-misses";
+      extra_type = PERF_TYPE_HW_CACHE;
+      extra_config = (PERF_COUNT_HW_CACHE_ITLB) |
+          (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+          (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
+      break;
+    case COUNTER_SET_L1I:
+      counter_name = "L1I-cache-load-misses";
+      extra_type = PERF_TYPE_HW_CACHE;
+      extra_config = (PERF_COUNT_HW_CACHE_L1I) |
+          (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+          (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
+      break;
+    case COUNTER_SET_BRANCH:
+      counter_name = "branch-misses";
+      extra_type = PERF_TYPE_HARDWARE;
+      extra_config = PERF_COUNT_HW_BRANCH_MISSES;
+      break;
+    case COUNTER_SET_L2:
+      counter_name = "L2/LL-cache-loads";
+      extra_type = PERF_TYPE_HW_CACHE;
+      // Note: PERF_COUNT_HW_CACHE_LL refers to Last Level Cache.
+      // On most ARM systems, this is L2 cache (or L3 if present).
+      extra_config = (PERF_COUNT_HW_CACHE_LL) |
+          (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+          (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16);
+      break;
+    case COUNTER_SET_DRAM_READS:
+      counter_name = "DRAM-reads";
+      extra_type = PERF_TYPE_HW_CACHE;
+      // Measure last-level cache misses which indicate DRAM reads
+      extra_config = (PERF_COUNT_HW_CACHE_LL) |
+          (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+          (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
+      break;
+  }
+
+  if (verbose == 1) {
+    printf(
+        "Setting up perf counters: cycles, instructions, %s\n", counter_name);
+  }
+
+  perf->fd_cycles =
+      setup_perf_counter(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, -1);
+  perf->fd_instructions = setup_perf_counter(
+      PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, perf->fd_cycles);
+  perf->fd_extra =
+      setup_perf_counter(extra_type, extra_config, perf->fd_cycles);
+
+  if (perf->fd_cycles == -1 || perf->fd_instructions == -1 ||
+      perf->fd_extra == -1) {
+    fprintf(
+        stderr,
+        "Warning: Some perf counters unavailable, continuing without them\n");
+    // Close whichever counters did open so the fds are not leaked.
+    if (perf->fd_cycles != -1) {
+      close(perf->fd_cycles);
+    }
+    if (perf->fd_instructions != -1) {
+      close(perf->fd_instructions);
+    }
+    if (perf->fd_extra != -1) {
+      close(perf->fd_extra);
+    }
+    perf->available = 0;
+    return -1;
+  }
+
+  perf->available = 1;
+  perf->count_cycles = 0;
+  perf->count_instructions = 0;
+  perf->count_extra = 0;
+  return 0;
+}
+
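+// fd_cycles acts as the perf event group leader (the other two events are
+// opened with it as group_fd), so the PERF_EVENT_IOC_RESET / ENABLE /
+// DISABLE ioctls below pass PERF_IOC_FLAG_GROUP to apply to all three
+// counters at once. Because the events are opened without
+// PERF_FORMAT_GROUP, each fd is still read individually as a single u64.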
+// Enable perf counters
+void perf_counters_enable(PerfCounters* perf) {
+  if (!perf->available) {
+    return;
+  }
+
+  ioctl(perf->fd_cycles, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
+  ioctl(perf->fd_cycles, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
+}
+
+// Disable perf counters and read results
+void perf_counters_disable_and_read(PerfCounters* perf) {
+  if (!perf->available) {
+    return;
+  }
+
+  ioctl(perf->fd_cycles, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
+
+  read(perf->fd_cycles, &perf->count_cycles, sizeof(long long));
+  read(perf->fd_instructions, &perf->count_instructions, sizeof(long long));
+  read(perf->fd_extra, &perf->count_extra, sizeof(long long));
+}
+
+// Print all measurement results
+void print_measurement_results(
+    const MeasurementResults* results,
+    unsigned long long iterations) {
+  printf("\n=== Measurement Results ===\n");
+  printf("Iterations: %llu\n\n", iterations);
+
+  // iTLB measurements
+  printf("--- iTLB Measurement ---\n");
+  printf("Cycles: %lld\n", results->cycles_itlb);
+  printf("Instructions: %lld\n", results->instructions_itlb);
+  printf("iTLB Load Misses: %lld\n", results->itlb_misses);
+  if (iterations > 0) {
+    double itlb_per_iter = (double)results->itlb_misses / (double)iterations;
+    double cycles_per_iter = (double)results->cycles_itlb / (double)iterations;
+    printf("iTLB Misses / Iteration: %.6f\n", itlb_per_iter);
+    printf("Cycles / Iteration: %.6f\n", cycles_per_iter);
+  }
+
+  // L1I measurements
+  printf("\n--- L1I Cache Measurement ---\n");
+  printf("Cycles: %lld\n", results->cycles_l1i);
+  printf("Instructions: %lld\n", results->instructions_l1i);
+  printf("L1I Cache Load Misses: %lld\n", results->l1i_misses);
+  if (iterations > 0) {
+    double l1i_per_iter = (double)results->l1i_misses / (double)iterations;
+    double cycles_per_iter = (double)results->cycles_l1i / (double)iterations;
+    printf("L1I Misses / Iteration: %.6f\n", l1i_per_iter);
+    printf("Cycles / Iteration: %.6f\n", cycles_per_iter);
+  }
+
+  // Branch measurements
+  printf("\n--- Branch Prediction Measurement ---\n");
+  printf("Cycles: %lld\n", results->cycles_branch);
+  printf("Instructions: %lld\n", results->instructions_branch);
+  printf("Branch Misses: %lld\n", results->branch_misses);
+  if (iterations > 0) {
+    double branch_per_iter =
+        (double)results->branch_misses / (double)iterations;
+    double cycles_per_iter =
+        (double)results->cycles_branch / (double)iterations;
+    printf("Branch Misses / Iteration: %.6f\n", branch_per_iter);
+    printf("Cycles / Iteration: %.6f\n", cycles_per_iter);
+  }
+
+  // L2 Cache measurements
+  // Note: ARM cache line size is typically 64 bytes.
+  // Reference: ARM Architecture Reference Manual; most ARM cores use
+  // 64-byte cache lines.
+  printf("\n--- L2 Cache Measurement ---\n");
+  printf("Cycles: %lld\n", results->cycles_l2);
+  printf("Instructions: %lld\n", results->instructions_l2);
+  printf("L2 Cache Loads: %lld\n", results->l2_loads);
+  if (iterations > 0) {
+    const long long CACHE_LINE_SIZE = 64; // bytes
+    double l2_loads_per_iter = (double)results->l2_loads / (double)iterations;
+    double l2_bytes_per_iter = l2_loads_per_iter * CACHE_LINE_SIZE;
+    double cycles_per_iter = (double)results->cycles_l2 / (double)iterations;
+    printf("L2 Loads / Iteration: %.6f\n", l2_loads_per_iter);
+    printf("L2 Bytes Loaded / Iteration: %.2f bytes\n", l2_bytes_per_iter);
+    printf("Cycles / Iteration: %.6f\n", cycles_per_iter);
+  }
+}
+
+// Cleanup perf counters
+void perf_counters_cleanup(PerfCounters* perf) {
+  if (!perf->available) {
+    return;
+  }
+
+  close(perf->fd_cycles);
+  close(perf->fd_instructions);
+  close(perf->fd_extra);
+}
+
+// Read CPU frequency in Hz
+unsigned long long get_cpu_frequency_hz(void) {
+  FILE* fp;
+  unsigned long long freq_khz = 0;
+
+  // Try to read from sysfs (frequency in kHz)
+  fp = fopen("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", "r");
+  if (fp) {
+    if (fscanf(fp, "%llu", &freq_khz) == 1) {
+      fclose(fp);
+      return freq_khz * 1000; // Convert kHz to Hz
+    }
+    fclose(fp);
+  }
+
+  // Fallback: try to read from /proc/cpuinfo
+  fp = fopen("/proc/cpuinfo", "r");
+  if (fp) {
+    char line[256];
+    while (fgets(line, sizeof(line), fp)) {
+      // Look for "cpu MHz" line
+      if (strstr(line, "cpu MHz")) {
+        float freq_mhz = 0;
+        if (sscanf(line, "cpu MHz : %f", &freq_mhz) == 1) {
+          fclose(fp);
+          return (unsigned long long)(freq_mhz * 1000000); // Convert MHz to Hz
+        }
+      }
+    }
+    fclose(fp);
+  }
+
+  fprintf(
+      stderr, "Warning: Could not read CPU frequency, using default 2.0 GHz\n");
+  return 2000000000ULL; // Default to 2 GHz
+}
+
+// Simple random number generator (ANSI C-style LCG)
+static unsigned long rand_state = 1;
+
+void my_srand(unsigned long seed) {
+  rand_state = seed;
+}
+
+unsigned long my_rand(void) {
+  rand_state = rand_state * 1103515245 + 12345;
+  return (rand_state / 65536) % 32768;
+}
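The detected frequency ends up in `PerfCounters::cpu_freq_hz`, which lets callers turn raw cycle counts into rough wall-clock estimates. A minimal sketch with a hypothetical helper (not part of the suite); note that `cpuinfo_max_freq` is the core's ceiling, so this underestimates elapsed time whenever the core ran below its maximum frequency:

```c
#include <stdio.h>

// Hypothetical helper: convert a measured cycle count into seconds
// using the frequency reported by get_cpu_frequency_hz().
static double cycles_to_seconds(long long cycles, unsigned long long freq_hz) {
  if (freq_hz == 0) {
    return 0.0;
  }
  return (double)cycles / (double)freq_hz;
}

int main(void) {
  // Example: 3e9 cycles at 2 GHz comes out to 1.5 seconds.
  printf("%.3f s\n", cycles_to_seconds(3000000000LL, 2000000000ULL));
  return 0;
}
```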
diff --git a/uarch_bench/utils.h b/uarch_bench/utils.h
new file mode 100644
index 00000000..dd028f1b
--- /dev/null
+++ b/uarch_bench/utils.h
@@ -0,0 +1,61 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#ifndef UTILS_H
+#define UTILS_H
+
+// Counter type for different measurement passes
+typedef enum {
+  COUNTER_SET_ITLB,
+  COUNTER_SET_L1I,
+  COUNTER_SET_BRANCH,
+  COUNTER_SET_L2,
+  COUNTER_SET_DRAM_READS
+} CounterSet;
+
+// Perf counter management struct
+typedef struct {
+  int fd_cycles;
+  int fd_instructions;
+  int fd_extra; // iTLB, L1I, branch, L2, or DRAM-read, per the counter set
+  int available;
+  long long count_cycles;
+  long long count_instructions;
+  long long count_extra;
+  unsigned long long cpu_freq_hz;
+  CounterSet counter_set;
+} PerfCounters;
+
+// Perf counter functions
+int perf_counters_init(PerfCounters* perf, CounterSet counter_set, int verbose);
+void perf_counters_enable(PerfCounters* perf);
+void perf_counters_disable_and_read(PerfCounters* perf);
+void perf_counters_cleanup(PerfCounters* perf);
+
+// Results structure to hold all measurements
+typedef struct {
+  long long cycles_itlb;
+  long long instructions_itlb;
+  long long itlb_misses;
+  long long cycles_l1i;
+  long long instructions_l1i;
+  long long l1i_misses;
+  long long cycles_branch;
+  long long instructions_branch;
+  long long branch_misses;
+  long long cycles_l2;
+  long long instructions_l2;
+  long long l2_loads;
+} MeasurementResults;
+
+void print_measurement_results(
+    const MeasurementResults* results,
+    unsigned long long iterations);
+
+// CPU frequency detection
+unsigned long long get_cpu_frequency_hz(void);
+
+// Random number generator
+void my_srand(unsigned long seed);
+unsigned long my_rand(void);
+
+#endif // UTILS_H
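Taken together, the intended call pattern for this API is init → enable → workload → disable_and_read → cleanup. A minimal sketch against the declarations above, assuming a single iTLB measurement pass; the workload loop is a placeholder (at -O3 an empty loop would be optimized away, so the real benchmarks execute generated code here):

```c
#include <stdio.h>
#include "utils.h"

int main(void) {
  PerfCounters perf = {0};
  MeasurementResults results = {0};
  const unsigned long long iterations = 1000000ULL;

  // One measurement pass using the iTLB counter set; if init fails, the
  // enable/read calls below become no-ops via perf.available.
  if (perf_counters_init(&perf, COUNTER_SET_ITLB, /*verbose=*/1) != 0) {
    fprintf(stderr, "perf counters unavailable\n");
  }

  perf_counters_enable(&perf);
  for (unsigned long long i = 0; i < iterations; i++) {
    // ... workload under test (placeholder) ...
  }
  perf_counters_disable_and_read(&perf);

  results.cycles_itlb = perf.count_cycles;
  results.instructions_itlb = perf.count_instructions;
  results.itlb_misses = perf.count_extra;

  print_measurement_results(&results, iterations);
  perf_counters_cleanup(&perf);
  return 0;
}
```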