diff --git a/uarch_bench/Makefile b/uarch_bench/Makefile
new file mode 100644
index 00000000..cf9bc179
--- /dev/null
+++ b/uarch_bench/Makefile
@@ -0,0 +1,42 @@
+CC = gcc
+CFLAGS = -O3 -Wall -Wextra
+
+CUDA_ROOT_DIR=/usr/local/cuda-12.8
+CUDA_ARCH=sm_100
+
+CUDA_LIB_DIR=-L$(CUDA_ROOT_DIR)/lib64
+CUDA_INC_DIR=-I$(CUDA_ROOT_DIR)/include
+NVCC = $(CUDA_ROOT_DIR)/bin/nvcc
+NVCCFLAGS = -O3 -arch=$(CUDA_ARCH) $(CUDA_INC_DIR) $(CUDA_LIB_DIR) -Xcompiler "-Wall -Wextra"
+
+TARGET = frontend_study
+CUDA_TARGET = fe_study_cuda
+THROUGHPUT_TARGET = instr_throughput
+BTB_TARGET = btb_estimate
+BTB_CALLS_TARGET = btb_estimate_calls
+OBJS = utils.o
+
+all: $(TARGET) $(CUDA_TARGET) $(THROUGHPUT_TARGET) $(BTB_TARGET) $(BTB_CALLS_TARGET)
+
+$(TARGET): frontend_study.c $(OBJS)
+	$(CC) $(CFLAGS) -o $(TARGET) frontend_study.c $(OBJS)
+
+$(CUDA_TARGET): fe_study_cuda.cu $(OBJS)
+	$(NVCC) $(NVCCFLAGS) -o $(CUDA_TARGET) fe_study_cuda.cu $(OBJS)
+
+$(THROUGHPUT_TARGET): instr_throughput.c $(OBJS)
+	$(CC) $(CFLAGS) -o $(THROUGHPUT_TARGET) instr_throughput.c $(OBJS)
+
+$(BTB_TARGET): btb_estimate.c $(OBJS)
+	$(CC) $(CFLAGS) -o $(BTB_TARGET) btb_estimate.c $(OBJS)
+
+$(BTB_CALLS_TARGET): btb_estimate_calls.c $(OBJS)
+	$(CC) $(CFLAGS) -o $(BTB_CALLS_TARGET) btb_estimate_calls.c $(OBJS)
+
+utils.o: utils.c utils.h
+	$(CC) $(CFLAGS) -c utils.c
+
+clean:
+	rm -f $(TARGET) $(CUDA_TARGET) $(THROUGHPUT_TARGET) $(BTB_TARGET) $(BTB_CALLS_TARGET) $(OBJS)
+
+.PHONY: all clean
diff --git a/uarch_bench/README.md b/uarch_bench/README.md
new file mode 100644
index 00000000..cb27892c
--- /dev/null
+++ b/uarch_bench/README.md
@@ -0,0 +1,233 @@
+# Microarchitecture Benchmark Suite
+
+This directory contains a suite of microbenchmarks designed to measure CPU microarchitectural properties, with a focus on instruction frontend (fetch/decode) behavior, cache hierarchies, and branch prediction characteristics.
+
+## Overview
+
+These benchmarks help characterize processor behavior by executing synthetic workloads and collecting performance counter data via Linux perf. The suite supports multiple architectures (x86-64, ARM64). One microbenchmark is specific to NVIDIA GPUs and measures performance counters for kernel launches.
+
+## Microbenchmarks
+
+### 1. frontend_study
+
+**File:** `frontend_study.c`
+
+**Purpose:** Measure instruction frontend performance and cache behavior with configurable code layout.
+
+**What it measures:**
+- iTLB misses (Instruction Translation Lookaside Buffer)
+- L1I cache misses (Level 1 Instruction cache)
+- Branch prediction misses
+- L2 cache loads
+- Cycles and instructions executed
+
+**Key features:**
+- Creates multiple dynamically allocated memory regions containing function copies
+- Functions are filled with architecture-specific NOPs (4-byte instructions)
+- Adjustable function sizes: 16, 64, 256, 1024, 4096, or 8192 NOPs
+- Two access patterns:
+  - **Sequential**: Calls functions in order modulo divisor
+  - **Random**: Uses PRNG for random function selection
+
+**Command-line parameters:**
+```
+-d : Number of different functions to cycle through
+-i : Total iterations of measurement loop
+-b : Size of allocated memory regions (MB)
+-n : Number of separate memory regions
+-s : Page size for function alignment (KB)
+-f : Function size in NOPs (16/64/256/1024/4096/8192)
+-r : 0=sequential, 1=random function selection
+```
+
+**Example usage:**
+```bash
+./frontend_study -d 10 -i 1000000 -b 32 -n 256 -s 64 -f 1024 -r 0
+```
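+
+**How it works (sketch):** The benchmark copies a compiled NOP function into executable pages and calls the copies through pointers. Below is a minimal, hypothetical illustration of that mechanism; the real code spreads many copies across regions at randomized page offsets, and the size-by-adjacent-symbol trick assumes the linker keeps the two functions adjacent:
+
+```c
+#include <string.h>
+#include <sys/mman.h>
+
+__attribute__((noinline)) void nop_body(void) { /* NOPs */ }
+__attribute__((noinline)) void nop_body_end(void) {}
+
+int main(void) {
+  // Estimate the function's size from the adjacent marker symbol.
+  size_t func_size = (size_t)((char*)nop_body_end - (char*)nop_body);
+
+  void* page = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
+                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (page == MAP_FAILED) return 1;
+
+  memcpy(page, (void*)nop_body, func_size);  // relocate the code
+  __builtin___clear_cache((char*)page, (char*)page + func_size);
+  ((void (*)(void))page)();                  // call the relocated copy
+
+  munmap(page, 4096);
+  return 0;
+}
+```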
+### 2. instr_throughput
+
+**File:** `instr_throughput.c`
+
+**Purpose:** Measure instruction fetch and decode throughput across varying code size ranges.
+
+**What it measures:**
+- Cycles and instructions for code execution
+- L1I cache misses
+- iTLB misses
+- L2 cache loads
+- DRAM reads (LL cache misses)
+- Bytes per cycle (throughput metric)
+
+**Key features:**
+- Tests 19 different code sizes from 1 KB to 1 GB
+- Dynamically generates executable code buffers filled with NOP instructions
+- Per-iteration metric collection
+- Auto-scales iterations based on code size (larger code = fewer iterations)
+
+**Typical test sizes:** 1K, 4K, 8K, 16K, 32K, 64K, 128K, 192K, 256K, 512K, 1M, 4M, 16M, 32M, 64M, 192M, 256M, 512M, 1G
+
+**Output:** Throughput metrics per iteration for each code size
+
+### 3. btb_estimate
+
+**File:** `btb_estimate.c`
+
+**Purpose:** Estimate Branch Target Buffer (BTB) capacity and behavior by measuring branch prediction performance.
+
+**What it measures:**
+- Branch misses
+- Instructions executed
+- Misses per instruction, per iteration, per buffer entry
+
+**Key features:**
+- Tests with buffer sizes from 256 to 262,144 entries (13 sizes)
+- Buffer contains random 0s and 1s
+- Based on the buffer value, executes 1024 or 1023 NOPs (a one-instruction difference)
+- Tracks when branch prediction fails
+- Helps determine BTB capacity on the processor
+
+**Logic:** A sharp rise in misses at a given buffer size indicates that the working set of branches has exceeded BTB capacity.
+
+### 4. btb_estimate_calls
+
+**File:** `btb_estimate_calls.c`
+
+**Purpose:** Estimate BTB capacity using indirect function calls instead of conditional branches.
+
+**What it measures:**
+- Branch misses from call instructions
+- Instructions executed
+- Misses per iteration and per function pointer
+
+**Key features:**
+- Allocates 128 to 4096 function pointers
+- Each function copy is placed on its own 64KB page
+- All functions are identical (1024 NOPs + return)
+- Measures branch prediction misses when calling through these pointers
+- Random offsets within pages prevent trivial prediction
+
+**Pages tested:** 128 to 4096 pages (21 sizes)
+
+**Output:** Branch miss metrics as function count increases (identifies BTB saturation point)
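+
+**Sketch of the call-based approach:** Both BTB benchmarks create many distinct branch targets and watch when misses per iteration start to climb. A toy, hypothetical version of the indirect-call loop at the heart of `btb_estimate_calls`:
+
+```c
+typedef void (*fn_t)(void);
+
+// Calling through an array of pointers to distinct code copies exercises one
+// branch-target-buffer entry per target; once `count` exceeds BTB capacity,
+// branch misses per iteration rise sharply.
+long long run_calls(fn_t* funcs, int count, int iterations) {
+  long long calls = 0;
+  for (int it = 0; it < iterations; it++) {
+    for (int i = 0; i < count; i++) {
+      funcs[i]();  // indirect call -> BTB lookup per target
+      calls++;
+    }
+  }
+  return calls;
+}
+```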
+
+### 5. fe_study_cuda
+
+**File:** `fe_study_cuda.cu`
+
+**Purpose:** Study instruction frontend behavior of the kernel-launch path for NVIDIA GPUs.
+
+**What it measures:**
+- Host CPU cycles and instructions per kernel launch
+- Host L1I cache misses (or iTLB misses / L2 loads, selectable) during launches
+- Kernel launch latency for null and non-null streams
+- Overhead of the L1I flush routine itself (profiled separately and subtracted)
+
+**Features:**
+- CUDA kernel compilation (sm_90 for x86-64, sm_100 for ARM)
+- NOP-based synthetic workloads
+- Profiles flush overhead for L1I cache
+
+**Target architecture:** NVIDIA GPUs
+
+## Supporting Files
+
+### utils.c / utils.h
+
+Provides common functionality for all benchmarks:
+- **Perf counter abstraction**: Wraps the Linux `perf_event_open` syscall
+- **Counter sets**: iTLB, L1I, Branch, L2, DRAM reads
+- **CPU frequency detection**: Reads from `/sys/devices/system/cpu/` or `/proc/cpuinfo`
+- **Measurement results aggregation**: Structures for storing multi-counter data
+- **LCG random number generator**: Deterministic RNG (`my_rand()`)
+
+### run_benchmark.sh
+
+Test harness for batch execution:
+- Reads input configuration from a file
+- Executes benchmarks with multiple parameter combinations
+- Outputs results in CSV format
+
+### full_run_input.txt
+
+Sample configuration parameters for `frontend_study`:
+- Tests varying divisors (16-512), iterations (10M-100M)
+- Memory configurations: 1-512MB buffers with 64KB pages
+- Function sizes: 16-8192 NOPs
+
+**Example row:** `10000000,100000000,32,256,64,16,0`
+- Divisor=10M, Iterations=100M, Buffer=32MB, Buffers=256, Page=64KB, Function=16 NOPs, No random
+
+## Building
+
+```bash
+make
+```
+
+The Makefile creates these executables:
+1. `frontend_study` - CPU frontend study (gcc)
+2. `fe_study_cuda` - CUDA version (nvcc)
+3. `instr_throughput` - Throughput benchmark
+4. `btb_estimate` - BTB sizing (branches)
+5. `btb_estimate_calls` - BTB sizing (calls)
+
+Build configuration:
+- Uses `gcc` for C benchmarks with `-O3` optimization
+- Uses `nvcc` for CUDA benchmarks with architecture-specific targets
+- Supports both x86-64 and ARM64 architectures
+
+## Usage Examples
+
+**Individual benchmark:**
+```bash
+./frontend_study -d 10 -i 1000000 -b 32 -n 256 -s 64 -f 1024 -r 0
+```
+
+**Batch execution with script:**
+```bash
+./run_benchmark.sh full_run_input.txt > results.csv
+```
+
+**Run instruction throughput tests:**
+```bash
+./instr_throughput
+```
+
+**Estimate BTB capacity:**
+```bash
+./btb_estimate
+./btb_estimate_calls
+```
+
+## Measurement Focus Areas
+
+| Benchmark | Primary Focus | Architecture | Key Metrics |
+|-----------|--------------|--------------|-------------|
+| frontend_study | Frontend/cache behavior | x86-64, ARM64 | iTLB, L1I, L2, Branch misses |
+| instr_throughput | Code size scaling | x86-64, ARM64 | Throughput, cache misses |
+| btb_estimate | BTB capacity (branches) | x86-64, ARM64 | Branch misses vs buffer size |
+| btb_estimate_calls | BTB capacity (calls) | x86-64, ARM64 | Call misses vs function count |
+| fe_study_cuda | GPU launch-path frontend | CUDA | Launch latency, counter metrics per launch |
+
+## Performance Counter Requirements
+
+These benchmarks require access to Linux perf events. You may need to adjust the `perf_event_paranoid` setting:
+
+```bash
+# Check current setting
+cat /proc/sys/kernel/perf_event_paranoid
+
+# Allow user access to performance counters (may require sudo)
+echo 1 | sudo tee /proc/sys/kernel/perf_event_paranoid
+```
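+
+Under the hood, the suite's counter plumbing is the raw `perf_event_open` syscall (wrapped in `utils.c`). For reference, a minimal stand-alone sketch of a single cycles counter (error handling trimmed; not the exact `utils.c` code):
+
+```c
+#include <linux/perf_event.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+int main(void) {
+  struct perf_event_attr pe;
+  memset(&pe, 0, sizeof(pe));
+  pe.type = PERF_TYPE_HARDWARE;
+  pe.size = sizeof(pe);
+  pe.config = PERF_COUNT_HW_CPU_CYCLES;
+  pe.disabled = 1;
+  pe.exclude_kernel = 1;
+
+  // pid=0, cpu=-1: count for this process on any CPU; no event group.
+  int fd = (int)syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
+  if (fd == -1) { perror("perf_event_open"); return 1; }
+
+  ioctl(fd, PERF_EVENT_IOC_RESET, 0);
+  ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
+  for (volatile int i = 0; i < 1000000; i++) { }  // region of interest
+  ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
+
+  long long cycles = 0;
+  read(fd, &cycles, sizeof(cycles));
+  printf("cycles: %lld\n", cycles);
+  close(fd);
+  return 0;
+}
+```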
+
+## Use Cases
+
+This benchmark suite is useful for:
+- Characterizing CPU microarchitecture behavior
+- Identifying performance bottlenecks in instruction fetch and decode
+- Understanding cache hierarchy characteristics
+- Measuring branch prediction capabilities and BTB capacity
+- Comparing performance across different processor generations
+- GPU instruction frontend analysis
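+
+## Measurement Skeleton
+
+All CPU benchmarks share the same measurement pattern around the `utils.h` API (names as declared there); a condensed sketch:
+
+```c
+#include <stdio.h>
+#include "utils.h"
+
+int main(void) {
+  PerfCounters perf;
+  // COUNTER_SET_* selects the third event measured next to cycles/instructions.
+  if (perf_counters_init(&perf, COUNTER_SET_L1I, 0) != 0) {
+    fprintf(stderr, "perf counters unavailable\n");
+    return 1;
+  }
+
+  perf_counters_enable(&perf);
+  /* ... workload under test ... */
+  perf_counters_disable_and_read(&perf);
+
+  printf("cycles=%lld instructions=%lld extra=%lld\n",
+         perf.count_cycles, perf.count_instructions, perf.count_extra);
+  perf_counters_cleanup(&perf);
+  return 0;
+}
+```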
"Misses/Iter", + "Misses/Buffer Size"); + printf( + "---------------------------------------------------------------------------------------------\n"); + + for (int i = 0; i < num_sizes; i++) { + int size = sizes[i]; + + // Allocate and initialize buffer with random 0s and 1s + int* buffer = (int*)malloc(size * sizeof(int)); + if (!buffer) { + fprintf(stderr, "Memory allocation failed\n"); + perf_counters_cleanup(&perf); + return 1; + } + + for (int j = 0; j < size; j++) { + buffer[j] = rand() % 2; // Random 0 or 1 + } + + // Process buffer and measure + long long warmup_sum = process_buffer(buffer, size, iterations[i], &perf); + + double misses_per_instruction = + (double)perf.count_extra / perf.count_instructions; + double misses_per_iteration = (double)perf.count_extra / iterations[i]; + double misses_per_buffer_size = + (double)perf.count_extra / iterations[i] / size; + + printf( + "%-10d %-12lld %-15lld %-15lld %-15.6f %-15.2f %-15.2f\n", + size, + warmup_sum, + perf.count_instructions, + perf.count_extra, + misses_per_instruction, + misses_per_iteration, + misses_per_buffer_size); + + free(buffer); + } + + perf_counters_cleanup(&perf); + + return 0; +} diff --git a/uarch_bench/btb_estimate_calls.c b/uarch_bench/btb_estimate_calls.c new file mode 100644 index 00000000..118d8d47 --- /dev/null +++ b/uarch_bench/btb_estimate_calls.c @@ -0,0 +1,204 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include +#include +#include +#include +#include +#include +#include "utils.h" + +// Architecture-specific NOPs - both use 4-byte instructions +#if defined(__aarch64__) +#define ARCH_NOOP __asm__ __volatile__("nop\n\t"); // ARM64 NOP (4 bytes) +#elif defined(__x86_64__) +// x86-64 4-byte NOP: 0F 1F 40 00 (NOP DWORD PTR [RAX+0]) +#define ARCH_NOOP __asm__ __volatile__(".byte 0x0F, 0x1F, 0x40, 0x00\n\t"); +#else +#error "Unsupported architecture" +#endif + +// Build up NOPs hierarchically +#define ARCH_NOOP_2 ARCH_NOOP ARCH_NOOP +#define ARCH_NOOP_4 ARCH_NOOP_2 ARCH_NOOP_2 +#define ARCH_NOOP_8 ARCH_NOOP_4 ARCH_NOOP_4 +#define ARCH_NOOP_16 ARCH_NOOP_8 ARCH_NOOP_8 +#define ARCH_NOOP_32 ARCH_NOOP_16 ARCH_NOOP_16 +#define ARCH_NOOP_64 ARCH_NOOP_32 ARCH_NOOP_32 +#define ARCH_NOOP_128 ARCH_NOOP_64 ARCH_NOOP_64 +#define ARCH_NOOP_256 ARCH_NOOP_128 ARCH_NOOP_128 +#define ARCH_NOOP_512 ARCH_NOOP_256 ARCH_NOOP_256 +#define ARCH_NOOP_1024 ARCH_NOOP_512 ARCH_NOOP_512 + +// 1023 = 512 + 256 + 128 + 64 + 32 + 16 + 8 + 4 + 2 + 1 +#define ARCH_NOOP_1023 \ + ARCH_NOOP_512 ARCH_NOOP_256 ARCH_NOOP_128 ARCH_NOOP_64 ARCH_NOOP_32 \ + ARCH_NOOP_16 ARCH_NOOP_8 ARCH_NOOP_4 ARCH_NOOP_2 ARCH_NOOP + +// Template function that executes 1024 NOPs and returns +__attribute__((noinline)) void nop_function_1024() { + ARCH_NOOP_1024 +} + +// Marker function to calculate size +__attribute__((noinline)) void nop_function_end() {} + +// Function that processes the buffer of function pointers +void process_buffer( + void (**buffer)(), + int size, + int iterations, + PerfCounters* perf) { + // Warm-up: traverse 10 times and call functions + for (int iter = 0; iter < 10; iter++) { + for (int i = 0; i < size; i++) { + buffer[i](); + } + } + + // Start measuring + perf_counters_enable(perf); + + // Main measurement loop - call function pointers + for (int iter = 0; iter < iterations; iter++) { + for (int i = 0; i < size; i++) { + buffer[i](); + } + } + + // Stop measuring + perf_counters_disable_and_read(perf); +} + +int main() { + int sizes[] = {128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, + 832, 
896, 960, 1024, 1536, 2048, 2560, 3072, 3584, 4096}; + int iterations[] = {10000, 10000, 10000, 10000, 10000, 10000, 10000, + 10000, 10000, 10000, 10000, 10000, 10000, 10000, + 10000, 10000, 10000, 10000, 10000, 10000, 10000}; + int num_sizes = sizeof(sizes) / sizeof(sizes[0]); + const size_t PAGE_SIZE = 64 * 1024; // 64KB pages + const size_t ALIGNMENT = 16; // Function alignment + + // Calculate function size + size_t function_size = (char*)nop_function_end - (char*)nop_function_1024; + printf("Function size: %zu bytes\n", function_size); + + if (function_size > PAGE_SIZE) { + fprintf(stderr, "Error: Function size exceeds page size\n"); + return 1; + } + + // Initialize perf counters + PerfCounters perf; + if (perf_counters_init(&perf, COUNTER_SET_BRANCH, 0) != 0) { + fprintf(stderr, "Failed to initialize performance counters\n"); + return 1; + } + + // Seed random number generator + srand(time(NULL)); + + printf("BTB Size Estimation (Function Calls)\n"); + printf("=====================================\n\n"); + printf( + "%-10s %-10s %-15s %-15s %-15s %-15s\n", + "Size", + "Pages", + "Instructions", + "Branch Misses", + "Misses/Iter", + "Misses/Buffer Size"); + printf( + "------------------------------------------------------------------------------------------------------------\n"); + + for (int i = 0; i < num_sizes; i++) { + int size = sizes[i]; + + // Calculate number of pages needed (one function per page) + int num_pages = size; + + // Allocate array to hold memory regions + void** pages = malloc(num_pages * sizeof(void*)); + if (!pages) { + fprintf(stderr, "Failed to allocate pages array\n"); + perf_counters_cleanup(&perf); + return 1; + } + + // Allocate function pointer buffer + void (**buffer)() = malloc(size * sizeof(void (*)())); + if (!buffer) { + fprintf(stderr, "Failed to allocate buffer\n"); + free(pages); + perf_counters_cleanup(&perf); + return 1; + } + + // Allocate executable memory pages and copy functions + for (int j = 0; j < num_pages; j++) { + // Allocate executable page + pages[j] = mmap( + NULL, + PAGE_SIZE, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, + -1, + 0); + + if (pages[j] == MAP_FAILED) { + fprintf( + stderr, + "Failed to allocate executable page %d for size %d\n", + j, + size); + // Cleanup previously allocated pages + for (int k = 0; k < j; k++) { + munmap(pages[k], PAGE_SIZE); + } + free(buffer); + free(pages); + perf_counters_cleanup(&perf); + return 1; + } + + // Calculate random offset within page (aligned) + size_t max_offset = PAGE_SIZE - function_size; + size_t offset = (rand() % (max_offset / ALIGNMENT)) * ALIGNMENT; + + // Copy function to this offset + void* dest = (char*)pages[j] + offset; + memcpy(dest, (void*)nop_function_1024, function_size); + + // Store function pointer in buffer + buffer[j] = (void (*)())dest; + } + + // Process buffer and measure + process_buffer(buffer, size, iterations[i], &perf); + + double misses_per_iteration = (double)perf.count_extra / iterations[i]; + double misses_per_buffer_size = + (double)perf.count_extra / iterations[i] / size; + + printf( + "%-10d %-10d %-15lld %-15lld %-15.2f %-15.6f\n", + size, + num_pages, + perf.count_instructions, + perf.count_extra, + misses_per_iteration, + misses_per_buffer_size); + + // Cleanup + for (int j = 0; j < num_pages; j++) { + munmap(pages[j], PAGE_SIZE); + } + free(buffer); + free(pages); + } + + perf_counters_cleanup(&perf); + + return 0; +} diff --git a/uarch_bench/fe_study_cuda.cu b/uarch_bench/fe_study_cuda.cu new file mode 100644 index 
00000000..0516967c
--- /dev/null
+++ b/uarch_bench/fe_study_cuda.cu
@@ -0,0 +1,316 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include <cassert>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+
+#include <vector>
+
+extern "C" {
+#include "utils.h"
+}
+
+__global__ void emptyKernel(const int n, float* __restrict__ gOutput) {}
+
+// Macros for unrolled NOP instructions (similar to frontend_study.c)
+//
+// AArch64 Instruction Encoding:
+// All AArch64 instructions are fixed-length 32-bit (4 bytes) encodings.
+// Reference: ARM Architecture Reference Manual ARMv8, Section C3.1 "A64
+// Instruction Set Encoding"
+// https://developer.arm.com/documentation/ddi0487/latest (ARM ARM for ARMv8-A)
+//
+// Calculation: 2MB = 2,097,152 bytes / 4 bytes per instruction = 524,288
+// instructions
+#define NOOP __asm__ __volatile__("nop\n\t");
+#define NOOP_4 NOOP NOOP NOOP NOOP
+#define NOOP_16 NOOP_4 NOOP_4 NOOP_4 NOOP_4
+#define NOOP_64 NOOP_16 NOOP_16 NOOP_16 NOOP_16
+#define NOOP_256 NOOP_64 NOOP_64 NOOP_64 NOOP_64
+#define NOOP_1024 NOOP_256 NOOP_256 NOOP_256 NOOP_256
+#define NOOP_4096 NOOP_1024 NOOP_1024 NOOP_1024 NOOP_1024
+#define NOOP_16384 NOOP_4096 NOOP_4096 NOOP_4096 NOOP_4096
+#define NOOP_65536 NOOP_16384 NOOP_16384 NOOP_16384 NOOP_16384
+#define NOOP_262144 NOOP_65536 NOOP_65536 NOOP_65536 NOOP_65536
+#define NOOP_524288 NOOP_262144 NOOP_262144
+
+// Function to flush instruction cache using NOP instructions
+__attribute__((noinline)) void flush_instruction_l1_cache_noop() {
+#if defined(__aarch64__)
+  // ARM/AArch64: 16,384 NOPs (64KB)
+  NOOP_16384
+#elif defined(__x86_64__) || defined(__i386__)
+  // x86: 24K noops is 24KB
+  NOOP_16384
+  NOOP_4096
+  NOOP_4096
+#else
+  // Fallback: default to 16,384 NOPs
+  NOOP_16384
+#endif
+}
+
+__attribute__((noinline)) void flush_for_flush_instruction_l1_cache_noop() {
+#if defined(__aarch64__)
+  // ARM/AArch64: 16,384 NOPs (64KB)
+  NOOP_16384
+#elif defined(__x86_64__) || defined(__i386__)
+  // x86: 8,192 NOPs (8KB)
+  NOOP_4096
+  NOOP_4096
+#else
+  // Fallback: default to 16,384 NOPs
+  NOOP_16384
+#endif
+}
+
+// Structure to hold profiled flush overhead metrics
+struct FlushOverhead {
+  double cycles_per_call;
+  double instructions_per_call;
+  double l1i_misses_per_call;
+};
+
+// Profile the flush_instruction_cache function to measure its overhead
+FlushOverhead profile_flush_overhead(CounterSet counter_set) {
+  FlushOverhead overhead = {0.0, 0.0, 0.0};
+
+  // Initialize perf counters for the specified counter set
+  PerfCounters perf;
+  int perf_available = (perf_counters_init(&perf, counter_set, 0) == 0);
+
+  if (!perf_available) {
+    printf("Warning: Performance counters not available for flush profiling\n");
+    return overhead;
+  }
+
+  // Profile provided flush function
+  printf("Profiling flush function overhead ...\n");
+
+  // Flush the L1I cache using a different function
+  flush_for_flush_instruction_l1_cache_noop();
+  perf_counters_enable(&perf);
+  flush_instruction_l1_cache_noop();
+  perf_counters_disable_and_read(&perf);
+
+  // Calculate per-call overhead
+  overhead.cycles_per_call = (double)perf.count_cycles;
+  overhead.instructions_per_call = (double)perf.count_instructions;
+  overhead.l1i_misses_per_call = (double)perf.count_extra;
+
+  printf(
+      "Flush overhead per call: Cycles: %.2f, Instructions: %.2f, Counter: %.2f\n",
+      overhead.cycles_per_call,
+      overhead.instructions_per_call,
+      overhead.l1i_misses_per_call);
+
+  perf_counters_cleanup(&perf);
+  return overhead;
+}
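+
+// timeLaunch: for each grid size, measure host-side perf counters around
+// numReps launches of emptyKernel, flushing the L1I before every launch so
+// the launch path starts cold. Two views are reported per grid size:
+//   Repeat-Run: averages over numReps launches, with the profiled flush
+//   overhead subtracted from the instruction/extra counts.
+//   Single-Run: one launch after three back-to-back flushes, giving the
+//   first-launch cost without averaging.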
+std::vector<float> timeLaunch(
+    const int numReps,
+    cudaStream_t stream,
+    const std::vector<int>& gridSizes,
+    const FlushOverhead& flush_overhead,
+    CounterSet counter_set) {
+  std::vector<float> timeUs;
+
+  // Initialize perf counters for the specified counter set
+  PerfCounters perf;
+  int perf_available = (perf_counters_init(&perf, counter_set, 0) == 0);
+  if (!perf_available) {
+    printf(
+        "Warning: Performance counters not available for launch profiling\n");
+    return timeUs;
+  }
+
+  for (const auto& numBlocks : gridSizes) {
+    dim3 block(256);
+    dim3 grid(numBlocks);
+    cudaDeviceSynchronize();
+
+    // Reset and enable perf counters before measurement
+    perf_counters_enable(&perf);
+
+    auto start = std::chrono::high_resolution_clock::now();
+    for (int i = 0; i < numReps; ++i) {
+      flush_instruction_l1_cache_noop();
+      emptyKernel<<<grid, block, 0, stream>>>(numBlocks, nullptr);
+    }
+
+    // Disable and read perf counters after measurement
+    perf_counters_disable_and_read(&perf);
+    cudaDeviceSynchronize();
+
+    auto end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<float, std::micro> elapsed = end - start;
+    timeUs.push_back(elapsed.count() / numReps);
+
+    // Calculate per-launch metrics (for emptyKernel only)
+    double instructions_per_launch = perf.count_instructions / (double)numReps -
+        (double)flush_overhead.instructions_per_call;
+    double counter_per_launch = (double)perf.count_extra / (double)numReps -
+        (double)flush_overhead.l1i_misses_per_call;
+    double cycles_per_launch = (double)perf.count_cycles / (double)numReps;
+    double ipc = (perf.count_cycles > 0)
+        ? ((double)perf.count_instructions / (double)perf.count_cycles)
+        : 0.0;
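+
+    // Single-run re-measurement: flush the L1I three times so the launch
+    // path starts cold, then profile exactly one launch (no division by
+    // numReps, no flush inside the measured window).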
+    flush_instruction_l1_cache_noop();
+    flush_instruction_l1_cache_noop();
+    flush_instruction_l1_cache_noop();
+    perf_counters_enable(&perf);
+    emptyKernel<<<grid, block, 0, stream>>>(numBlocks, nullptr);
+    perf_counters_disable_and_read(&perf);
+
+    double instructions_per_launch_single = (double)perf.count_instructions;
+    double counter_per_launch_single = (double)perf.count_extra;
+    double cycles_per_launch_single = (double)perf.count_cycles;
+    double ipc_single = (perf.count_cycles > 0)
+        ? ((double)perf.count_instructions / (double)perf.count_cycles)
+        : 0.0;
+
+    printf(
+        "  Grid %6d: Repeat-Run: Instructions: %.2f, Counter: %.2f, Cycles (including overhead): %.2f, IPC: %.4f\n",
+        numBlocks,
+        instructions_per_launch,
+        counter_per_launch,
+        cycles_per_launch,
+        ipc);
+
+    printf(
+        "  Grid %6d: Single-Run: Instructions: %.2f, Counter: %.2f, Cycles (excluding overhead): %.2f, IPC: %.4f\n",
+        numBlocks,
+        instructions_per_launch_single,
+        counter_per_launch_single,
+        cycles_per_launch_single,
+        ipc_single);
+  }
+
+  // Cleanup perf counters
+  if (perf_available) {
+    perf_counters_cleanup(&perf);
+  }
+
+  return timeUs;
+}
+
+int main(int argc, char* argv[]) {
+  int gpuIdx = 0;
+  int numReps = 10000;
+  int minBlocks = 1;
+  int maxBlocks = 128 * 1024;
+  bool warmUpLaunch = false;
+  int verbosityLevel = 1;
+  CounterSet counter_set = COUNTER_SET_L1I; // Default to L1I cache misses
+
+  if (argc >= 2) {
+    gpuIdx = atoi(argv[1]);
+  }
+  if (argc >= 3) {
+    numReps = atoi(argv[2]);
+  }
+  if (argc >= 5) {
+    minBlocks = atoi(argv[3]);
+    maxBlocks = atoi(argv[4]);
+  }
+  if (argc >= 6) {
+    warmUpLaunch = (atoi(argv[5]) == 1);
+  }
+  if (argc >= 7) {
+    // Parse counter set argument
+    const char* counter_arg = argv[6];
+    if (strcmp(counter_arg, "l1i") == 0 || strcmp(counter_arg, "L1I") == 0) {
+      counter_set = COUNTER_SET_L1I;
+      printf("Using L1 Instruction cache misses counter\n");
+    } else if (
+        strcmp(counter_arg, "itlb") == 0 || strcmp(counter_arg, "ITLB") == 0) {
+      counter_set = COUNTER_SET_ITLB;
+      printf("Using iTLB misses counter\n");
+    } else if (
+        strcmp(counter_arg, "l2") == 0 || strcmp(counter_arg, "L2") == 0) {
+      counter_set = COUNTER_SET_L2;
+      printf("Using L2 load instructions counter\n");
+    } else {
+      fprintf(
+          stderr,
+          "Error: Invalid counter type '%s'. Valid options: l1i, itlb, l2\n",
+          counter_arg);
+      fprintf(
+          stderr,
+          "Usage: %s [gpuIdx] [numReps] [minBlocks] [maxBlocks] [warmUpLaunch] [counter_type]\n",
+          argv[0]);
+      fprintf(
+          stderr,
+          "  counter_type: l1i (L1I cache misses), itlb (iTLB misses), l2 (L2 load instructions)\n");
+      return 1;
+    }
+  }
+
+  std::vector<int> gridSizes;
+  for (int i = minBlocks; i <= maxBlocks; i *= 2) {
+    gridSizes.push_back(i);
+  }
+
+  cudaSetDevice(gpuIdx);
+  cudaFree(0);
+
+  cudaStream_t stream = 0;
+  cudaStreamCreate(&stream);
+
+  // Warm-up launch, to get the code to the GPU.
+  if (warmUpLaunch) {
+    emptyKernel<<<1, 1>>>(0, nullptr);
+  }
+
+  // Profile selected flush function overhead
+  // Calculate the size of the flush function
+  void* flush_func_ptr = (void*)flush_instruction_l1_cache_noop;
+  void* flush_for_flush_func_ptr =
+      (void*)flush_for_flush_instruction_l1_cache_noop;
+  size_t flush_func_size =
+      (size_t)((char*)flush_for_flush_func_ptr - (char*)flush_func_ptr);
+  printf(
+      "Size of flush_instruction_l1_cache_noop function: %zu bytes\n",
+      flush_func_size);
+
+  FlushOverhead flush_overhead = profile_flush_overhead(counter_set);
+
+  // Measure launch latency for null stream.
+  printf("\n=== Measuring null stream launch latencies ===\n");
+  auto nullStreamLaunchLatencies =
+      timeLaunch(numReps, 0, gridSizes, flush_overhead, counter_set);
+
+  // Measure launch latency for non-null stream.
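+  // Note: stream 0 is the legacy default (null) stream with implicit
+  // synchronization semantics; the explicitly created stream exercises the
+  // regular asynchronous launch path, so the two rows bracket typical
+  // launch behavior.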
+ printf("\n=== Measuring non-null stream launch latencies ===\n"); + auto nonNullStreamLaunchLatencies = + timeLaunch(numReps, stream, gridSizes, flush_overhead, counter_set); + + assert( + nullStreamLaunchLatencies.size() == nonNullStreamLaunchLatencies.size()); + assert(nullStreamLaunchLatencies.size() == gridSizes.size()); + + if (verbosityLevel > 0) { + std::cout << gpuIdx << " " << numReps << " " << minBlocks << " " + << maxBlocks << " " << warmUpLaunch << std::endl; + printf(" CTAs null non-null\n"); + printf("----------------------\n"); + } + for (long unsigned int i = 0; i < gridSizes.size(); ++i) { + printf( + "%6d %6.2f %6.2f\n", + gridSizes[i], + nullStreamLaunchLatencies[i], + nonNullStreamLaunchLatencies[i]); + // std::cout << gridSizes[i] << " " << nullStreamLaunchLatencies[i] << " " + // << nonNullStreamLaunchLatencies[i] << std::endl; + } + + cudaStreamDestroy(stream); + + return 0; +} diff --git a/uarch_bench/frontend_study.c b/uarch_bench/frontend_study.c new file mode 100644 index 00000000..602f06fa --- /dev/null +++ b/uarch_bench/frontend_study.c @@ -0,0 +1,418 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "utils.h" + +#define MISS1 \ + funcIndex = iter % divisor; \ + iter++; \ + functionPointers[funcIndex](); +#define MISS4 MISS1 MISS1 MISS1 MISS1 +#define MISS16 MISS4 MISS4 MISS4 MISS4 +#define MISS64 MISS16 MISS16 MISS16 MISS16 +#define MISS256 MISS64 MISS64 MISS64 MISS64 +#define MISS1024 MISS256 MISS256 MISS256 MISS256 + +#define RAND_MISS1 \ + funcIndex = my_rand() % divisor; \ + iter++; \ + functionPointers[funcIndex](); +#define RAND_MISS4 RAND_MISS1 RAND_MISS1 RAND_MISS1 RAND_MISS1 +#define RAND_MISS16 RAND_MISS4 RAND_MISS4 RAND_MISS4 RAND_MISS4 +#define RAND_MISS64 RAND_MISS16 RAND_MISS16 RAND_MISS16 RAND_MISS16 +#define RAND_MISS256 RAND_MISS64 RAND_MISS64 RAND_MISS64 RAND_MISS64 +#define RAND_MISS1024 RAND_MISS256 RAND_MISS256 RAND_MISS256 RAND_MISS256 + +// Architecture-specific NOPs - both use 4-byte instructions +#if defined(__aarch64__) +#define ARCH_NOOP __asm__ __volatile__("nop\n\t"); // ARM64 NOP (4 bytes) +#elif defined(__x86_64__) +// x86-64 4-byte NOP: 0F 1F 40 00 (NOP DWORD PTR [RAX+0]) +#define ARCH_NOOP __asm__ __volatile__(".byte 0x0F, 0x1F, 0x40, 0x00\n\t"); +#else +#error "Unsupported architecture" +#endif + +#define ARCH_NOOP_4 ARCH_NOOP ARCH_NOOP ARCH_NOOP ARCH_NOOP +#define ARCH_NOOP_16 ARCH_NOOP_4 ARCH_NOOP_4 ARCH_NOOP_4 ARCH_NOOP_4 +#define ARCH_NOOP_64 ARCH_NOOP_16 ARCH_NOOP_16 ARCH_NOOP_16 ARCH_NOOP_16 +#define ARCH_NOOP_256 ARCH_NOOP_64 ARCH_NOOP_64 ARCH_NOOP_64 ARCH_NOOP_64 +#define ARCH_NOOP_1024 ARCH_NOOP_256 ARCH_NOOP_256 ARCH_NOOP_256 ARCH_NOOP_256 +#define ARCH_NOOP_4096 \ + ARCH_NOOP_1024 ARCH_NOOP_1024 ARCH_NOOP_1024 ARCH_NOOP_1024 +#define ARCH_NOOP_8192 ARCH_NOOP_4096 ARCH_NOOP_4096 + +// Simple no-op functions of various sizes +__attribute__((noinline)) void targetFunction16() { + ARCH_NOOP_16 +} + +__attribute__((noinline)) void targetFunction64() { + ARCH_NOOP_64 +} + +__attribute__((noinline)) void targetFunction256() { + ARCH_NOOP_256 +} + +__attribute__((noinline)) void targetFunction1024() { + ARCH_NOOP_1024 +} + +__attribute__((noinline)) void targetFunction4096() { + ARCH_NOOP_4096 +} + +__attribute__((noinline)) void targetFunction8192() { + ARCH_NOOP_8192 +} + +// Marker function for size estimation +__attribute__((noinline)) void targetFunctionEnd() {} + +// Run the main measurement loop 
+
+// Run the main measurement loop
+void run_measurement_loop(
+    void (**functionPointers)(void),
+    int divisor,
+    unsigned long long iterations,
+    int use_random_jumps) {
+  size_t funcIndex = 0;
+  unsigned long long iter = 0;
+
+  if (use_random_jumps) {
+    while (iter < iterations) {
+      RAND_MISS1024
+    }
+  } else {
+    while (iter < iterations) {
+      MISS1024
+    }
+  }
+}
+
+// Print usage information
+void print_usage(const char* program_name) {
+  fprintf(
+      stderr,
+      "Usage: %s -d <divisor> -i <iterations> -b <buffer_size_mb> -n <num_buffers> -s <page_size_kb> -f <func_nops> -r <use_random_jumps>\n",
+      program_name);
+}
+
+int main(int argc, char* argv[]) {
+  // Initialize variables with invalid values to detect missing arguments
+  int divisor = -1;
+  unsigned long long iterations = 0;
+  int buffer_size_mb = -1;
+  int num_buffers = -1;
+  int page_kb = -1;
+  int use_random_jumps = -1;
+  int func_nops = -1;
+  const unsigned long code_alignment = 16;
+
+  // Parse command line arguments
+  int opt;
+  while ((opt = getopt(argc, argv, "d:i:b:n:s:f:r:h")) != -1) {
+    switch (opt) {
+      case 'd':
+        divisor = atoi(optarg);
+        break;
+      case 'i':
+        iterations = atoll(optarg);
+        break;
+      case 'b':
+        buffer_size_mb = atoi(optarg);
+        break;
+      case 'n':
+        num_buffers = atoi(optarg);
+        break;
+      case 's':
+        page_kb = atoi(optarg);
+        break;
+      case 'f':
+        func_nops = atoi(optarg);
+        break;
+      case 'r':
+        use_random_jumps = atoi(optarg);
+        break;
+      case 'h':
+        print_usage(argv[0]);
+        return 0;
+      default:
+        print_usage(argv[0]);
+        return 1;
+    }
+  }
+
+  // Validate all required arguments are present
+  if (divisor == -1 || iterations == 0 || buffer_size_mb == -1 ||
+      num_buffers == -1 || page_kb == -1 || use_random_jumps == -1 ||
+      func_nops == -1) {
+    fprintf(stderr, "Error: All arguments are required.\n\n");
+    print_usage(argv[0]);
+    return 1;
+  }
+
+  // Validate argument values
+  if (divisor <= 0) {
+    fprintf(stderr, "Error: Divisor must be a positive integer\n");
+    return 1;
+  }
+
+  if (iterations <= 0) {
+    fprintf(stderr, "Error: Iterations must be a positive integer\n");
+    return 1;
+  }
+
+  if (buffer_size_mb <= 0) {
+    fprintf(stderr, "Error: Buffer size (MB) must be a positive integer\n");
+    return 1;
+  }
+
+  if (num_buffers <= 0) {
+    fprintf(stderr, "Error: Number of buffers must be a positive integer\n");
+    return 1;
+  }
+
+  if (page_kb <= 0) {
+    fprintf(stderr, "Error: Page (KB) must be a positive integer\n");
+    return 1;
+  }
+
+  if (func_nops != 16 && func_nops != 64 && func_nops != 256 &&
+      func_nops != 1024 && func_nops != 4096 && func_nops != 8192) {
+    fprintf(stderr, "Error: Function NOPs must be 16/64/256/1024/4096/8192\n");
+    return 1;
+  }
+
+  if (use_random_jumps != 0 && use_random_jumps != 1) {
+    fprintf(stderr, "Error: Random jumps must be 0 or 1\n");
+    return 1;
+  }
+
+  // Select function to use based on the requested NOP size
+  size_t functionSize = 0;
+  void (*targetFunction)(void) = ({
+    void (*fn)(void) = NULL;
+    switch (func_nops) {
+      case 16:
+        fn = targetFunction16;
+        functionSize = (char*)targetFunction64 - (char*)targetFunction16;
+        break;
+      case 64:
+        fn = targetFunction64;
+        functionSize = (char*)targetFunction256 - (char*)targetFunction64;
+        break;
+      case 256:
+        fn = targetFunction256;
+        functionSize = (char*)targetFunction1024 - (char*)targetFunction256;
+        break;
+      case 1024:
+        fn = targetFunction1024;
+        functionSize = (char*)targetFunction4096 - (char*)targetFunction1024;
+        break;
+      case 4096:
+        fn = targetFunction4096;
+        functionSize = (char*)targetFunction8192 - (char*)targetFunction4096;
+        break;
+      case 8192:
+        fn = targetFunction8192;
+        functionSize = (char*)targetFunctionEnd - (char*)targetFunction8192;
+        break;
default: + // Fallback to the smallest function if an invalid size is somehow + // passed + fn = targetFunction16; + functionSize = (char*)targetFunction64 - (char*)targetFunction16; + break; + } + fn; + }); + + // Calculate derived values + unsigned long page_bytes = (unsigned long long)page_kb * 1024; + unsigned long long num_copies = (unsigned long long)num_buffers * + (unsigned long long)buffer_size_mb * 1024 / (unsigned long long)page_kb; + int num_regions = num_buffers; + + printf("Using divisor: %d\n", divisor); + printf("Using iterations: %llu\n", iterations); + printf( + "Allocating %d regions of %d MB each (total %d MB)...\n", + num_regions, + buffer_size_mb, + num_buffers * buffer_size_mb); + printf("Page: %d KB\n", page_kb); + printf("Number of function copies: %llu\n", num_copies); + printf("Estimated function size: %zu bytes\n", functionSize); + + if (num_copies < (unsigned long long)divisor) { + fprintf(stderr, "Warning: setting 'divisor' to the num_copies value.\n"); + divisor = num_copies; + } + + // Allocate array to hold pointers to each region + void** regions = malloc(num_regions * sizeof(void*)); + if (!regions) { + fprintf(stderr, "Failed to allocate regions array\n"); + return 1; + } + + // Map each region separately + const unsigned long long REGION_SIZE = + (unsigned long long)buffer_size_mb * 1024 * 1024; + printf("Mapping %d memory regions...\n", num_regions); + for (int i = 0; i < num_regions; i++) { + regions[i] = mmap( + NULL, + REGION_SIZE, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, + -1, + 0); + + if (regions[i] == MAP_FAILED) { + fprintf(stderr, "Failed to allocate memory region %d\n", i); + // Cleanup already allocated regions + for (int j = 0; j < i; j++) { + munmap(regions[j], REGION_SIZE); + } + free(regions); + return 1; + } + } + printf("Successfully mapped %d regions\n", num_regions); + + // Allocate array to store function pointers + void (**functionPointers)(void) = malloc(num_copies * sizeof(void (*)(void))); + if (!functionPointers) { + fprintf(stderr, "Failed to allocate function pointer array\n"); + for (int i = 0; i < num_regions; i++) { + munmap(regions[i], REGION_SIZE); + } + free(regions); + return 1; + } + + // Copy function code to regions with page + printf("Copying function code...\n"); + for (unsigned long long i = 0; i < num_copies; i++) { + unsigned long intra_page_offset = my_rand() % + ((page_bytes - functionSize - 1) / code_alignment) * code_alignment; + + // Calculate total byte offset + unsigned long long total_offset = (i * page_bytes) + intra_page_offset; + + // Determine which region and offset within that region + int region_idx = total_offset / REGION_SIZE; + unsigned long long region_offset = total_offset % REGION_SIZE; + + // Calculate destination address + void* dest = (char*)regions[region_idx] + region_offset; + + memcpy(dest, (void*)targetFunction, functionSize); + functionPointers[i] = (void (*)(void))dest; + } + printf("Function copies created: %llu\n", num_copies); + + // Get CPU frequency (just once) + unsigned long long cpu_freq_hz = get_cpu_frequency_hz(); + printf("CPU Frequency: %.2f GHz\n\n", cpu_freq_hz / 1e9); + + // Structure to hold all measurement results + MeasurementResults results; + memset(&results, 0, sizeof(MeasurementResults)); + + // ========== Measurement 0: Timing only (no profiling) ========== + printf("=== Starting Measurement 0: Timing only (no profiling) ===\n"); + struct timespec start_time, end_time; + clock_gettime(CLOCK_MONOTONIC, &start_time); + 
run_measurement_loop(functionPointers, divisor, iterations, use_random_jumps); + clock_gettime(CLOCK_MONOTONIC, &end_time); + + // Calculate elapsed time in seconds + double elapsed_sec = (end_time.tv_sec - start_time.tv_sec) + + (end_time.tv_nsec - start_time.tv_nsec) / 1e9; + double iterations_per_sec = iterations / elapsed_sec; + + printf("Elapsed time: %.6f seconds\n", elapsed_sec); + printf("Iterations per second: %.2f M/s\n", iterations_per_sec / 1e6); + printf("Completed timing measurement\n\n"); + + // ========== Measurement 1: iTLB misses ========== + printf("=== Starting Measurement 1: iTLB misses ===\n"); + PerfCounters perf_itlb; + if (perf_counters_init(&perf_itlb, COUNTER_SET_ITLB, 0) == 0) { + perf_counters_enable(&perf_itlb); + run_measurement_loop( + functionPointers, divisor, iterations, use_random_jumps); + perf_counters_disable_and_read(&perf_itlb); + results.cycles_itlb = perf_itlb.count_cycles; + results.instructions_itlb = perf_itlb.count_instructions; + results.itlb_misses = perf_itlb.count_extra; + perf_counters_cleanup(&perf_itlb); + printf("Completed iTLB measurement\n\n"); + } + + // ========== Measurement 2: L1I cache misses ========== + printf("=== Starting Measurement 2: L1I cache misses ===\n"); + PerfCounters perf_l1i; + if (perf_counters_init(&perf_l1i, COUNTER_SET_L1I, 0) == 0) { + perf_counters_enable(&perf_l1i); + run_measurement_loop( + functionPointers, divisor, iterations, use_random_jumps); + perf_counters_disable_and_read(&perf_l1i); + results.cycles_l1i = perf_l1i.count_cycles; + results.instructions_l1i = perf_l1i.count_instructions; + results.l1i_misses = perf_l1i.count_extra; + perf_counters_cleanup(&perf_l1i); + printf("Completed L1I measurement\n\n"); + } + + // ========== Measurement 3: Branch misses ========== + printf("=== Starting Measurement 3: Branch misses ===\n"); + PerfCounters perf_branch; + if (perf_counters_init(&perf_branch, COUNTER_SET_BRANCH, 0) == 0) { + perf_counters_enable(&perf_branch); + run_measurement_loop( + functionPointers, divisor, iterations, use_random_jumps); + perf_counters_disable_and_read(&perf_branch); + results.cycles_branch = perf_branch.count_cycles; + results.instructions_branch = perf_branch.count_instructions; + results.branch_misses = perf_branch.count_extra; + perf_counters_cleanup(&perf_branch); + printf("Completed Branch measurement\n\n"); + } + + // ========== Measurement 4: L2 cache loads ========== + printf("=== Starting Measurement 4: L2 cache loads ===\n"); + PerfCounters perf_l2; + if (perf_counters_init(&perf_l2, COUNTER_SET_L2, 0) == 0) { + perf_counters_enable(&perf_l2); + run_measurement_loop( + functionPointers, divisor, iterations, use_random_jumps); + perf_counters_disable_and_read(&perf_l2); + results.cycles_l2 = perf_l2.count_cycles; + results.instructions_l2 = perf_l2.count_instructions; + results.l2_loads = perf_l2.count_extra; + perf_counters_cleanup(&perf_l2); + printf("Completed L2 measurement\n\n"); + } + + // Print all results + print_measurement_results(&results, iterations); + + // Cleanup + free(functionPointers); + for (int i = 0; i < num_regions; i++) { + munmap(regions[i], REGION_SIZE); + } + free(regions); + + return 0; +} diff --git a/uarch_bench/full_run_input.txt b/uarch_bench/full_run_input.txt new file mode 100644 index 00000000..8a518970 --- /dev/null +++ b/uarch_bench/full_run_input.txt @@ -0,0 +1,41 @@ +10000000,100000000,32,256,64,16,0 +10000000,100000000,32,256,64,64,0 +10000000,100000000,32,256,64,256,0 +10000000,100000000,32,256,64,1024,0 
+10000000,10000000,32,256,64,4096,0
+10000000,10000000,32,256,64,8192,0
+10000000,100000000,16,128,64,16,0
+10000000,100000000,16,128,64,64,0
+10000000,100000000,16,128,64,256,0
+10000000,100000000,16,128,64,1024,0
+10000000,10000000,16,128,64,4096,0
+10000000,10000000,16,128,64,8192,0
+10000000,100000000,512,64,64,16,0
+10000000,100000000,512,64,64,64,0
+10000000,100000000,512,64,64,256,0
+10000000,100000000,512,64,64,1024,0
+10000000,10000000,512,64,64,4096,0
+10000000,10000000,512,64,64,8192,0
+10000000,10000000,16,64,64,16,0
+10000000,10000000,32,64,64,16,0
+10000000,10000000,64,64,64,16,0
+10000000,10000000,128,64,64,16,0
+10000000,10000000,256,64,64,16,0
+10000000,10000000,512,64,64,16,0
+10000000,10000000,1,20,1024,16,0
+10000000,10000000,1,25,1024,16,0
+10000000,10000000,1,30,1024,16,0
+10000000,10000000,1,32,1024,16,0
+10000000,10000000,1,36,1024,16,0
+10000000,10000000,1,40,1024,16,0
+10000000,10000000,1,42,1024,16,0
+10000000,10000000,1,44,1024,16,0
+10000000,10000000,1,48,1024,16,0
+10000000,10000000,1,64,1024,16,0
+10000000,10000000,1,128,1024,16,0
+10000000,10000000,1,256,1024,16,0
+10000000,10000000,1,512,1024,16,0
+10000000,10000000,1,1024,1024,16,0
+10000000,10000000,1,2048,1024,16,0
+10000000,10000000,1,4096,1024,16,0
+10000000,10000000,1,8192,1024,16,0
diff --git a/uarch_bench/instr_throughput.c b/uarch_bench/instr_throughput.c
new file mode 100644
index 00000000..3f4f66f3
--- /dev/null
+++ b/uarch_bench/instr_throughput.c
@@ -0,0 +1,420 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <unistd.h>
+#include "utils.h"
+
+// Architecture-specific code generation
+#if defined(__aarch64__)
+#define NOP_INSTRUCTION 0xD503201F // ARM64 NOP instruction
+#define RET_INSTRUCTION 0xD65F03C0 // ARM64 RET instruction
+#define INSTRUCTION_SIZE 4
+#elif defined(__x86_64__)
+// x86-64 4-byte NOP: 0F 1F 40 00 (NOP DWORD PTR [RAX+0])
+// Little-endian representation: bytes 0x0F, 0x1F, 0x40, 0x00 -> uint32_t
+// 0x00401F0F
+#define NOP_INSTRUCTION 0x00401F0F // x86-64 4-byte NOP instruction
+#define RET_INSTRUCTION 0xC3 // x86-64 RET instruction (1 byte)
+#define INSTRUCTION_SIZE 4
+#else
+#error "Unsupported architecture"
+#endif
+
+// Structure to hold an executable code buffer
+typedef struct {
+  void* buffer;
+  size_t size;
+  void (*func)(void);
+} CodeBuffer;
+
+// Structure to hold results for a single test
+typedef struct {
+  const char* size_name;
+  unsigned long size_kb;
+  unsigned long num_nops;
+  CodeBuffer* code;
+  long long cycles;
+  long long instructions;
+  long long l1i_misses;
+  long long itlb_misses;
+  long long l2_loads;
+  long long dram_reads;
+  double bytes_per_cycle;
+} TestResult;
+
+// Allocate executable memory buffer
+CodeBuffer* create_code_buffer(size_t size_kb) {
+  CodeBuffer* code = (CodeBuffer*)malloc(sizeof(CodeBuffer));
+  if (!code) {
+    fprintf(stderr, "Failed to allocate CodeBuffer structure\n");
+    return NULL;
+  }
+
+  size_t size_bytes = size_kb * 1024;
+  code->size = size_bytes;
+
+  // Allocate executable memory using mmap
+  code->buffer = mmap(
+      NULL,
+      size_bytes,
+      PROT_READ | PROT_WRITE | PROT_EXEC,
+      MAP_PRIVATE | MAP_ANONYMOUS,
+      -1,
+      0);
+
+  if (code->buffer == MAP_FAILED) {
+    fprintf(
+        stderr,
+        "Failed to allocate executable memory of size %zu KB\n",
+        size_kb);
+    free(code);
+    return NULL;
+  }
+
+  code->func = (void (*)(void))code->buffer;
+  return code;
+}
+
+// Free code buffer
+void free_code_buffer(CodeBuffer* code) {
+  if
(code) { + if (code->buffer != MAP_FAILED) { + munmap(code->buffer, code->size); + } + free(code); + } +} + +// Generate executable code in buffer +// Returns the number of NOP instructions generated +unsigned long generate_code(CodeBuffer* code) { + if (!code || !code->buffer) { + return 0; + } + +#if defined(__aarch64__) + // ARM64: Generate code + uint32_t* ptr = (uint32_t*)code->buffer; + size_t num_instructions = code->size / sizeof(uint32_t); + + // Fill buffer with NOPs + for (size_t i = 0; i < num_instructions - 1; i++) { + ptr[i] = NOP_INSTRUCTION; + } + + // Add RET at the end + ptr[num_instructions - 1] = RET_INSTRUCTION; + + // Flush instruction cache for ARM + __builtin___clear_cache( + (char*)code->buffer, (char*)code->buffer + code->size); + + return num_instructions - 1; // All instructions except RET are NOPs + +#elif defined(__x86_64__) + // x86-64: Generate code with 4-byte NOPs + uint32_t* ptr = (uint32_t*)code->buffer; + size_t num_instructions = code->size / sizeof(uint32_t); + + // Fill buffer with 4-byte NOPs + for (size_t i = 0; i < num_instructions - 1; i++) { + ptr[i] = NOP_INSTRUCTION; + } + + // Add RET at the end (overwrite last NOP with RET in first byte) + uint8_t* ret_ptr = (uint8_t*)&ptr[num_instructions - 1]; + ret_ptr[0] = RET_INSTRUCTION; + + // x86 typically has coherent I-cache, but flush anyway + __builtin___clear_cache( + (char*)code->buffer, (char*)code->buffer + code->size); + + return num_instructions - 1; // All instructions except last are NOPs + +#endif +} + +// Run a single test with all counter sets +void run_test(TestResult* result, unsigned long iterations) { + PerfCounters perf; + + if (!result->code || !result->code->func) { + fprintf(stderr, "Invalid code buffer for test %s\n", result->size_name); + return; + } + + // Measure with L1I counter set + if (perf_counters_init(&perf, COUNTER_SET_L1I, 0) == 0) { + perf_counters_enable(&perf); + for (unsigned long i = 0; i < iterations; i++) { + result->code->func(); + } + perf_counters_disable_and_read(&perf); + result->cycles = perf.count_cycles; + result->instructions = perf.count_instructions; + result->l1i_misses = perf.count_extra; + perf_counters_cleanup(&perf); + } + + // Measure with iTLB counter set + if (perf_counters_init(&perf, COUNTER_SET_ITLB, 0) == 0) { + perf_counters_enable(&perf); + for (unsigned long i = 0; i < iterations; i++) { + result->code->func(); + } + perf_counters_disable_and_read(&perf); + result->itlb_misses = perf.count_extra; + perf_counters_cleanup(&perf); + } + + // Measure with L2 counter set + if (perf_counters_init(&perf, COUNTER_SET_L2, 0) == 0) { + perf_counters_enable(&perf); + for (unsigned long i = 0; i < iterations; i++) { + result->code->func(); + } + perf_counters_disable_and_read(&perf); + result->l2_loads = perf.count_extra; + perf_counters_cleanup(&perf); + } + + // Measure with DRAM reads counter set + if (perf_counters_init(&perf, COUNTER_SET_DRAM_READS, 0) == 0) { + perf_counters_enable(&perf); + for (unsigned long i = 0; i < iterations; i++) { + result->code->func(); + } + perf_counters_disable_and_read(&perf); + result->dram_reads = perf.count_extra; + perf_counters_cleanup(&perf); + } + + // Calculate bytes per cycle (instructions * instruction_size / cycles) + if (result->cycles > 0) { + result->bytes_per_cycle = + ((double)result->instructions * INSTRUCTION_SIZE) / + (double)result->cycles; + } else { + result->bytes_per_cycle = 0.0; + } +} + +// Print results in a table format +void print_results_header() { + printf("\n"); + printf( + 
"====================================================================================================\n"); + printf( + "%-12s %10s %12s %12s %10s %12s %12s %12s %12s\n", + "Size", + "NOPs", + "Cycles", + "Instructions", + "Bytes/Cycle", + "L1I Misses", + "iTLB Misses", + "L2 Loads", + "DRAM Reads"); + printf( + "====================================================================================================\n"); +} + +void print_result(const TestResult* result, unsigned long iterations) { + // Calculate per-iteration metrics + double cycles_per_iter = (double)result->cycles / iterations; + double instructions_per_iter = (double)result->instructions / iterations; + double l1i_per_iter = (double)result->l1i_misses / iterations; + double itlb_per_iter = (double)result->itlb_misses / iterations; + double l2_per_iter = (double)result->l2_loads / iterations; + double dram_per_iter = (double)result->dram_reads / iterations; + + printf( + "%-12s %10lu %12.0f %12.0f %10.4f %12.2f %12.2f %12.2f %12.2f\n", + result->size_name, + result->num_nops, + cycles_per_iter, + instructions_per_iter, + result->bytes_per_cycle, + l1i_per_iter, + itlb_per_iter, + l2_per_iter, + dram_per_iter); +} + +void print_usage(const char* program_name) { + fprintf( + stderr, + "Usage: %s [-i ] [-s ]\n", + program_name); + fprintf( + stderr, + " -i : Number of iterations per test (default: auto-calculated, multiples of 10)\n"); + fprintf( + stderr, + " -s : Test only specific size in KB (1, 4, 8, 16, 32, 64, 128, 192, 256, 512, 1024, 4096, 16384, 32768, 65536, 196608, 262144, 524288, 1048576)\n"); + fprintf(stderr, " -h: Show this help message\n"); +} + +int main(int argc, char* argv[]) { + unsigned long iterations = 0; // 0 means auto-calculate + long specific_size_kb = -1; // -1 means run all sizes + + // Parse command-line arguments + int opt; + while ((opt = getopt(argc, argv, "i:s:h")) != -1) { + switch (opt) { + case 'i': + iterations = strtoul(optarg, NULL, 10); + break; + case 's': + specific_size_kb = strtol(optarg, NULL, 10); + break; + case 'h': + print_usage(argv[0]); + return 0; + default: + print_usage(argv[0]); + return 1; + } + } + + // Define test sizes in KB + unsigned long test_sizes_kb[] = { + 1, + 4, + 8, + 16, + 32, + 64, + 128, + 192, + 256, + 512, + 1024, + 4096, + 16384, + 32768, + 65536, + 196608, + 262144, + 524288, + 1048576}; + int num_sizes = sizeof(test_sizes_kb) / sizeof(test_sizes_kb[0]); + + printf("Dynamic Instruction Throughput Benchmark\n"); + printf("Architecture: "); +#if defined(__aarch64__) || defined(__arm__) + printf("ARM/AArch64\n"); + printf("Instruction size: 4 bytes (NOP)\n"); +#elif defined(__x86_64__) || defined(__i386__) + printf("x86/x86_64\n"); + printf("Instruction size: 4 bytes (multi-byte NOP)\n"); +#else + printf("Unknown\n"); +#endif + + unsigned long cpu_freq = get_cpu_frequency_hz(); + printf("CPU Frequency: %.2f GHz\n", (double)cpu_freq / 1e9); + + // Allocate and generate code buffers + printf("\nAllocating and generating code buffers...\n"); + TestResult* tests = (TestResult*)calloc(num_sizes, sizeof(TestResult)); + if (!tests) { + fprintf(stderr, "Failed to allocate test results array\n"); + return 1; + } + + int num_tests = 0; + for (int i = 0; i < num_sizes; i++) { + unsigned long size_kb = test_sizes_kb[i]; + + // Skip if specific size requested and this isn't it + if (specific_size_kb != -1 && size_kb != (unsigned long)specific_size_kb) { + continue; + } + + printf(" Creating %lu KB code buffer...\n", size_kb); + + CodeBuffer* code = 
create_code_buffer(size_kb); + if (!code) { + fprintf(stderr, "Failed to create %lu KB buffer, skipping\n", size_kb); + continue; + } + + unsigned long num_nops = generate_code(code); + + // Create size name + char* size_name = (char*)malloc(32); + if (size_kb >= 1024) { + snprintf(size_name, 32, "%luM", size_kb / 1024); + } else { + snprintf(size_name, 32, "%luK", size_kb); + } + + tests[num_tests].size_name = size_name; + tests[num_tests].size_kb = size_kb; + tests[num_tests].num_nops = num_nops; + tests[num_tests].code = code; + tests[num_tests].cycles = 0; + tests[num_tests].instructions = 0; + tests[num_tests].l1i_misses = 0; + tests[num_tests].itlb_misses = 0; + tests[num_tests].l2_loads = 0; + tests[num_tests].dram_reads = 0; + tests[num_tests].bytes_per_cycle = 0.0; + + num_tests++; + } + + printf("\nRunning tests...\n"); + print_results_header(); + + // Run tests + for (int i = 0; i < num_tests; i++) { + // Auto-calculate iterations if not specified + // Use fewer iterations for larger functions to keep runtime reasonable + // Ensure iterations are multiples of 10 + unsigned long test_iterations = iterations; + if (test_iterations == 0) { + if (tests[i].size_kb <= 64) { + test_iterations = 1000000; + } else if (tests[i].size_kb <= 512) { + test_iterations = 100000; + } else if (tests[i].size_kb <= 4096) { + test_iterations = 10000; + } else if (tests[i].size_kb <= 65536) { + test_iterations = 1000; + } else { + test_iterations = 100; + } + + // Ensure it's a multiple of 10 + test_iterations = (test_iterations / 10) * 10; + if (test_iterations == 0) { + test_iterations = 10; + } + } + + run_test(&tests[i], test_iterations); + print_result(&tests[i], test_iterations); + } + + printf( + "====================================================================================================\n"); + printf("\nNote: All metrics shown are per-iteration averages.\n"); + + // Cleanup + for (int i = 0; i < num_tests; i++) { + free_code_buffer(tests[i].code); + free((void*)tests[i].size_name); + } + free(tests); + + return 0; +} diff --git a/uarch_bench/run_benchmark.sh b/uarch_bench/run_benchmark.sh new file mode 100644 index 00000000..5012a4b9 --- /dev/null +++ b/uarch_bench/run_benchmark.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# Script to run frontend_study with various parameters and collect performance metrics +# Input: Text file with comma-separated values (one configuration per line) +# Output: CSV format with performance metrics + +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +INPUT_FILE="$1" + +if [ ! -f "$INPUT_FILE" ]; then + echo "Error: Input file '$INPUT_FILE' not found" + exit 1 +fi + +# Get the directory where this script is located +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +EXECUTABLE="$SCRIPT_DIR/frontend_study" + +if [ ! 
-x "$EXECUTABLE" ]; then + echo "Error: Executable '$EXECUTABLE' not found or not executable" + exit 1 +fi + +# Print CSV header +echo "buffer_size_mb,num_buffers,function_size_bytes,elapsed_time,iterations_per_sec,cycles,instructions,l1_cache_misses_per_iter,l1_tlb_misses_per_iter,branch_misses_per_iter" + +# Process each line in the input file +while IFS= read -r line; do + # Skip empty lines + [[ -z "$line" ]] && continue + + # Parse comma-separated values + IFS=',' read -r divisor iterations buffer_size num_buffers page_size func_sel random_jumps <<< "$line" + + # Trim whitespace + divisor=$(echo "$divisor" | xargs) + iterations=$(echo "$iterations" | xargs) + buffer_size=$(echo "$buffer_size" | xargs) + num_buffers=$(echo "$num_buffers" | xargs) + page_size=$(echo "$page_size" | xargs) + func_sel=$(echo "$func_sel" | xargs) + random_jumps=$(echo "$random_jumps" | xargs) + + # Run the executable with the parsed parameters + RUN_OUTPUT=$($EXECUTABLE -d $divisor -i $iterations -b $buffer_size -n $num_buffers -s $page_size -f $func_sel -r $random_jumps 2>&1) + + # Extract performance metrics from the output + FUNCTION_SIZE=$(echo "$RUN_OUTPUT" | grep -oP 'Estimated function size:\s+\K[0-9]+' || echo "0") + ELAPSED_TIME=$(echo "$RUN_OUTPUT" | grep -oP 'Elapsed time:\s+\K[0-9.]+' || echo "0") + ITER_PER_SEC=$(echo "$RUN_OUTPUT" | grep -oP 'Iterations per second:\s+\K[0-9.]+' || echo "0") + + # Get cycles and instructions + CYCLES=$(echo "$RUN_OUTPUT" | grep -oP 'Cycles:\s+\K[0-9]+' | tr -d ',' | head -n 1 || echo "0") + INSTRUCTIONS=$(echo "$RUN_OUTPUT" | grep -oP 'Instructions:\s+\K[0-9]+' | tr -d ',' | head -n 1 || echo "0") + + # Get L1 cache misses, L1 TLB misses, and branch misses + L1_CACHE_MISSES_PER_ITER=$(echo "$RUN_OUTPUT" | grep -oP 'L1I Misses / Iteration:\s+\K[0-9]+(?:\.[0-9]+)?' | tr -d ',' || echo "0") + L1_TLB_MISSES_PER_ITER=$(echo "$RUN_OUTPUT" | grep -oP 'iTLB Misses / Iteration:\s+\K[0-9]+(?:\.[0-9]+)?' | tr -d ',' || echo "0") + BRANCH_MISSES_PER_ITER=$(echo "$RUN_OUTPUT" | grep -oP 'Branch Misses / Iteration:\s+\K[0-9]+(?:\.[0-9]+)?' | tr -d ',' || echo "0") + + # Output CSV row + echo "$buffer_size,$num_buffers,$FUNCTION_SIZE,$ELAPSED_TIME,$ITER_PER_SEC,$CYCLES,$INSTRUCTIONS,$L1_CACHE_MISSES_PER_ITER,$L1_TLB_MISSES_PER_ITER,$BRANCH_MISSES_PER_ITER" + +done < "$INPUT_FILE" diff --git a/uarch_bench/utils.c b/uarch_bench/utils.c new file mode 100644 index 00000000..7275f9ab --- /dev/null +++ b/uarch_bench/utils.c @@ -0,0 +1,272 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
diff --git a/uarch_bench/utils.c b/uarch_bench/utils.c
new file mode 100644
index 00000000..7275f9ab
--- /dev/null
+++ b/uarch_bench/utils.c
@@ -0,0 +1,272 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#define _GNU_SOURCE
+#include "utils.h"
+#include <errno.h>
+#include <linux/perf_event.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+// Wrapper for perf_event_open syscall
+static long perf_event_open(
+    struct perf_event_attr* hw_event,
+    pid_t pid,
+    int cpu,
+    int group_fd,
+    unsigned long flags) {
+  return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
+}
+
+// Setup a perf counter (internal helper)
+static int setup_perf_counter(uint32_t type, uint64_t config, int group_fd) {
+  struct perf_event_attr pe;
+  memset(&pe, 0, sizeof(struct perf_event_attr));
+  pe.type = type;
+  pe.size = sizeof(struct perf_event_attr);
+  pe.config = config;
+  pe.disabled = 1; // Start disabled
+  pe.exclude_kernel = 1;
+  pe.exclude_hv = 1;
+
+  int fd = perf_event_open(&pe, 0, -1, group_fd, 0);
+  if (fd == -1) {
+    fprintf(
+        stderr,
+        "Error opening perf counter (type=%u, config=%llu): %s\n",
+        type,
+        (unsigned long long)config,
+        strerror(errno));
+  }
+  return fd;
+}
+
+// Initialize perf counters
+int perf_counters_init(
+    PerfCounters* perf,
+    CounterSet counter_set,
+    int verbose) {
+  perf->cpu_freq_hz = get_cpu_frequency_hz();
+  perf->counter_set = counter_set;
+
+  const char* counter_name = "";
+  uint64_t extra_config = 0;
+  uint32_t extra_type = PERF_TYPE_HARDWARE;
+
+  // Configure the extra counter based on counter set
+  switch (counter_set) {
+    case COUNTER_SET_ITLB:
+      counter_name = "iTLB-load-misses";
+      extra_type = PERF_TYPE_HW_CACHE;
+      extra_config = (PERF_COUNT_HW_CACHE_ITLB) |
+          (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+          (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
+      break;
+    case COUNTER_SET_L1I:
+      counter_name = "L1I-cache-load-misses";
+      extra_type = PERF_TYPE_HW_CACHE;
+      extra_config = (PERF_COUNT_HW_CACHE_L1I) |
+          (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+          (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
+      break;
+    case COUNTER_SET_BRANCH:
+      counter_name = "branch-misses";
+      extra_type = PERF_TYPE_HARDWARE;
+      extra_config = PERF_COUNT_HW_BRANCH_MISSES;
+      break;
+    case COUNTER_SET_L2:
+      counter_name = "L2/LL-cache-loads";
+      extra_type = PERF_TYPE_HW_CACHE;
+      // Note: PERF_COUNT_HW_CACHE_LL refers to Last Level Cache.
+      // On most ARM systems, this is L2 cache (or L3 if present).
+      extra_config = (PERF_COUNT_HW_CACHE_LL) |
+          (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+          (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16);
+      break;
+    case COUNTER_SET_DRAM_READS:
+      counter_name = "DRAM-reads";
+      extra_type = PERF_TYPE_HW_CACHE;
+      // Measure last-level cache misses which indicate DRAM reads
+      extra_config = (PERF_COUNT_HW_CACHE_LL) |
+          (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+          (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
+      break;
+  }
+
+  if (verbose == 1) {
+    printf(
+        "Setting up perf counters: cycles, instructions, %s\n", counter_name);
+  }
+
+  perf->fd_cycles =
+      setup_perf_counter(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, -1);
+  perf->fd_instructions = setup_perf_counter(
+      PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, perf->fd_cycles);
+  perf->fd_extra =
+      setup_perf_counter(extra_type, extra_config, perf->fd_cycles);
+
+  if (perf->fd_cycles == -1 || perf->fd_instructions == -1 ||
+      perf->fd_extra == -1) {
+    fprintf(
+        stderr,
+        "Warning: Some perf counters unavailable, continuing without them\n");
+    // Close whichever counters did open so the fds are not leaked.
+    if (perf->fd_cycles != -1) {
+      close(perf->fd_cycles);
+    }
+    if (perf->fd_instructions != -1) {
+      close(perf->fd_instructions);
+    }
+    if (perf->fd_extra != -1) {
+      close(perf->fd_extra);
+    }
+    perf->available = 0;
+    return -1;
+  }
+
+  perf->available = 1;
+  perf->count_cycles = 0;
+  perf->count_instructions = 0;
+  perf->count_extra = 0;
+  return 0;
+}
+
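+// fd_cycles acts as the perf event group leader (the other two events are
+// opened with it as group_fd), so the PERF_EVENT_IOC_RESET / ENABLE /
+// DISABLE ioctls below pass PERF_IOC_FLAG_GROUP to apply to all three
+// counters at once. Because the events are opened without
+// PERF_FORMAT_GROUP, each fd is still read individually as a single u64.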
+// Enable perf counters
+void perf_counters_enable(PerfCounters* perf) {
+  if (!perf->available) {
+    return;
+  }
+
+  ioctl(perf->fd_cycles, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
+  ioctl(perf->fd_cycles, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
+}
+
+// Disable perf counters and read results
+void perf_counters_disable_and_read(PerfCounters* perf) {
+  if (!perf->available) {
+    return;
+  }
+
+  ioctl(perf->fd_cycles, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
+
+  read(perf->fd_cycles, &perf->count_cycles, sizeof(long long));
+  read(perf->fd_instructions, &perf->count_instructions, sizeof(long long));
+  read(perf->fd_extra, &perf->count_extra, sizeof(long long));
+}
+
+// Print all measurement results
+void print_measurement_results(
+    const MeasurementResults* results,
+    unsigned long long iterations) {
+  printf("\n=== Measurement Results ===\n");
+  printf("Iterations: %llu\n\n", iterations);
+
+  // iTLB measurements
+  printf("--- iTLB Measurement ---\n");
+  printf("Cycles: %lld\n", results->cycles_itlb);
+  printf("Instructions: %lld\n", results->instructions_itlb);
+  printf("iTLB Load Misses: %lld\n", results->itlb_misses);
+  if (iterations > 0) {
+    double itlb_per_iter = (double)results->itlb_misses / (double)iterations;
+    double cycles_per_iter = (double)results->cycles_itlb / (double)iterations;
+    printf("iTLB Misses / Iteration: %.6f\n", itlb_per_iter);
+    printf("Cycles / Iteration: %.6f\n", cycles_per_iter);
+  }
+
+  // L1I measurements
+  printf("\n--- L1I Cache Measurement ---\n");
+  printf("Cycles: %lld\n", results->cycles_l1i);
+  printf("Instructions: %lld\n", results->instructions_l1i);
+  printf("L1I Cache Load Misses: %lld\n", results->l1i_misses);
+  if (iterations > 0) {
+    double l1i_per_iter = (double)results->l1i_misses / (double)iterations;
+    double cycles_per_iter = (double)results->cycles_l1i / (double)iterations;
+    printf("L1I Misses / Iteration: %.6f\n", l1i_per_iter);
+    printf("Cycles / Iteration: %.6f\n", cycles_per_iter);
+  }
+
+  // Branch measurements
+  printf("\n--- Branch Prediction Measurement ---\n");
+  printf("Cycles: %lld\n", results->cycles_branch);
+  printf("Instructions: %lld\n", results->instructions_branch);
+  printf("Branch Misses: %lld\n", results->branch_misses);
+  if (iterations > 0) {
+    double branch_per_iter =
+        (double)results->branch_misses / (double)iterations;
+    double cycles_per_iter =
+        (double)results->cycles_branch / (double)iterations;
+    printf("Branch Misses / Iteration: %.6f\n", branch_per_iter);
+    printf("Cycles / Iteration: %.6f\n", cycles_per_iter);
+  }
+
+  // L2 Cache measurements
+  // Note: ARM cache line size is typically 64 bytes.
+  // Reference: ARM Architecture Reference Manual; most ARM cores use
+  // 64-byte cache lines.
+  printf("\n--- L2 Cache Measurement ---\n");
+  printf("Cycles: %lld\n", results->cycles_l2);
+  printf("Instructions: %lld\n", results->instructions_l2);
+  printf("L2 Cache Loads: %lld\n", results->l2_loads);
+  if (iterations > 0) {
+    const long long CACHE_LINE_SIZE = 64; // bytes
+    double l2_loads_per_iter = (double)results->l2_loads / (double)iterations;
+    double l2_bytes_per_iter = l2_loads_per_iter * CACHE_LINE_SIZE;
+    double cycles_per_iter = (double)results->cycles_l2 / (double)iterations;
+    printf("L2 Loads / Iteration: %.6f\n", l2_loads_per_iter);
+    printf("L2 Bytes Loaded / Iteration: %.2f bytes\n", l2_bytes_per_iter);
+    printf("Cycles / Iteration: %.6f\n", cycles_per_iter);
+  }
+}
+
+// Cleanup perf counters
+void perf_counters_cleanup(PerfCounters* perf) {
+  if (!perf->available) {
+    return;
+  }
+
+  close(perf->fd_cycles);
+  close(perf->fd_instructions);
+  close(perf->fd_extra);
+}
+
+// Read CPU frequency in Hz
+unsigned long long get_cpu_frequency_hz(void) {
+  FILE* fp;
+  unsigned long long freq_khz = 0;
+
+  // Try to read from sysfs (frequency in kHz)
+  fp = fopen("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", "r");
+  if (fp) {
+    if (fscanf(fp, "%llu", &freq_khz) == 1) {
+      fclose(fp);
+      return freq_khz * 1000; // Convert kHz to Hz
+    }
+    fclose(fp);
+  }
+
+  // Fallback: try to read from /proc/cpuinfo
+  fp = fopen("/proc/cpuinfo", "r");
+  if (fp) {
+    char line[256];
+    while (fgets(line, sizeof(line), fp)) {
+      // Look for "cpu MHz" line
+      if (strstr(line, "cpu MHz")) {
+        float freq_mhz = 0;
+        if (sscanf(line, "cpu MHz : %f", &freq_mhz) == 1) {
+          fclose(fp);
+          return (unsigned long long)(freq_mhz * 1000000); // Convert MHz to Hz
+        }
+      }
+    }
+    fclose(fp);
+  }
+
+  fprintf(
+      stderr, "Warning: Could not read CPU frequency, using default 2.0 GHz\n");
+  return 2000000000ULL; // Default to 2 GHz
+}
+
+// Simple random number generator (ANSI C-style LCG)
+static unsigned long rand_state = 1;
+
+void my_srand(unsigned long seed) {
+  rand_state = seed;
+}
+
+unsigned long my_rand(void) {
+  rand_state = rand_state * 1103515245 + 12345;
+  return (rand_state / 65536) % 32768;
+}
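The detected frequency ends up in `PerfCounters::cpu_freq_hz`, which lets callers turn raw cycle counts into rough wall-clock estimates. A minimal sketch with a hypothetical helper (not part of the suite); note that `cpuinfo_max_freq` is the core's ceiling, so this underestimates elapsed time whenever the core ran below its maximum frequency:

```c
#include <stdio.h>

// Hypothetical helper: convert a measured cycle count into seconds
// using the frequency reported by get_cpu_frequency_hz().
static double cycles_to_seconds(long long cycles, unsigned long long freq_hz) {
  if (freq_hz == 0) {
    return 0.0;
  }
  return (double)cycles / (double)freq_hz;
}

int main(void) {
  // Example: 3e9 cycles at 2 GHz comes out to 1.5 seconds.
  printf("%.3f s\n", cycles_to_seconds(3000000000LL, 2000000000ULL));
  return 0;
}
```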
diff --git a/uarch_bench/utils.h b/uarch_bench/utils.h
new file mode 100644
index 00000000..dd028f1b
--- /dev/null
+++ b/uarch_bench/utils.h
@@ -0,0 +1,61 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#ifndef UTILS_H
+#define UTILS_H
+
+// Counter type for different measurement passes
+typedef enum {
+  COUNTER_SET_ITLB,
+  COUNTER_SET_L1I,
+  COUNTER_SET_BRANCH,
+  COUNTER_SET_L2,
+  COUNTER_SET_DRAM_READS
+} CounterSet;
+
+// Perf counter management struct
+typedef struct {
+  int fd_cycles;
+  int fd_instructions;
+  int fd_extra; // iTLB, L1I, branch, L2, or DRAM-read, per the counter set
+  int available;
+  long long count_cycles;
+  long long count_instructions;
+  long long count_extra;
+  unsigned long long cpu_freq_hz;
+  CounterSet counter_set;
+} PerfCounters;
+
+// Perf counter functions
+int perf_counters_init(PerfCounters* perf, CounterSet counter_set, int verbose);
+void perf_counters_enable(PerfCounters* perf);
+void perf_counters_disable_and_read(PerfCounters* perf);
+void perf_counters_cleanup(PerfCounters* perf);
+
+// Results structure to hold all measurements
+typedef struct {
+  long long cycles_itlb;
+  long long instructions_itlb;
+  long long itlb_misses;
+  long long cycles_l1i;
+  long long instructions_l1i;
+  long long l1i_misses;
+  long long cycles_branch;
+  long long instructions_branch;
+  long long branch_misses;
+  long long cycles_l2;
+  long long instructions_l2;
+  long long l2_loads;
+} MeasurementResults;
+
+void print_measurement_results(
+    const MeasurementResults* results,
+    unsigned long long iterations);
+
+// CPU frequency detection
+unsigned long long get_cpu_frequency_hz(void);
+
+// Random number generator
+void my_srand(unsigned long seed);
+unsigned long my_rand(void);
+
+#endif // UTILS_H
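Taken together, the intended call pattern for this API is init → enable → workload → disable_and_read → cleanup. A minimal sketch against the declarations above, assuming a single iTLB measurement pass; the workload loop is a placeholder (at -O3 an empty loop would be optimized away, so the real benchmarks execute generated code here):

```c
#include <stdio.h>
#include "utils.h"

int main(void) {
  PerfCounters perf = {0};
  MeasurementResults results = {0};
  const unsigned long long iterations = 1000000ULL;

  // One measurement pass using the iTLB counter set; if init fails, the
  // enable/read calls below become no-ops via perf.available.
  if (perf_counters_init(&perf, COUNTER_SET_ITLB, /*verbose=*/1) != 0) {
    fprintf(stderr, "perf counters unavailable\n");
  }

  perf_counters_enable(&perf);
  for (unsigned long long i = 0; i < iterations; i++) {
    // ... workload under test (placeholder) ...
  }
  perf_counters_disable_and_read(&perf);

  results.cycles_itlb = perf.count_cycles;
  results.instructions_itlb = perf.count_instructions;
  results.itlb_misses = perf.count_extra;

  print_measurement_results(&results, iterations);
  perf_counters_cleanup(&perf);
  return 0;
}
```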