Skip to content

Commit 3e790df

Browse files
committed
Add prefix cache aware benchmarking config
1 parent 3e930cb commit 3e790df

File tree

9 files changed

+434
-126
lines changed

9 files changed

+434
-126
lines changed

benchmarking/benchmark-values.yaml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,16 @@ logLevel: INFO
1818

1919
# A GCS bucket path that points to the dataset file.
2020
# The file will be copied from this path to the local file system
21-
# at /dataset/dataset.json for use during the run.
22-
# NOTE: For this dataset to be used, config.data.path must also be explicitly set to /dataset/dataset.json.
21+
# at /dataset/gcs-dataset.json for use during the run.
22+
# NOTE: For this dataset to be used, config.data.path must also be explicitly set to /dataset/gcs-dataset.json.
2323
gcsPath: ""
2424

25+
# An S3 bucket path that points to the dataset file.
26+
# The file will be copied from this path to the local file system
27+
# at /dataset/s3-dataset.json for use during the run.
28+
# NOTE: For this dataset to be used, config.data.path must also be explicitly set to /dataset/s3-dataset.json.
29+
s3Path: ""
30+
2531
# hfToken optionally creates a secret with the specified token.
2632
# Can be set using helm install --set hftoken=<token>
2733
hfToken: ""

benchmarking/download-gcs-results.bash

Lines changed: 0 additions & 32 deletions
This file was deleted.

benchmarking/download-results.bash

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#!/bin/bash

# Downloads benchmark result files from a GCS or S3 bucket into a local
# directory laid out as:
#   <script_dir>/<output_dir>/<run_id>/<benchmark_id>/results/json/
#
# Usage: download-results.bash <gcs|s3> <BUCKET> [FOLDER_PATH]
#   FOLDER_PATH defaults to benchmark_results/.

# Check if provider and bucket are provided as arguments
if [ -z "$1" ] || [ -z "$2" ]; then
  echo "Usage: $0 <gcs|s3> <BUCKET> [FOLDER_PATH:DEFAULT=benchmark_results]"
  exit 1
fi

PROVIDER="$1"
BUCKET="$2"
FOLDER_PATH="${3:-benchmark_results/}" # Default to benchmark_results/ if not provided

# Env vars to be passed when calling this script.
# The id of the benchmark. This is needed to identify what the benchmark is for.
# It decides the filepath to save the results, which later is used by the jupyter notebook to assign
# the benchmark_id as data labels for plotting.
benchmark_id=${benchmark_id:-"inference-extension"}
# run_id can be used to group different runs of the same benchmarks for comparison.
run_id=${run_id:-"default-run"}
output_dir=${output_dir:-'output'}

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
benchmark_output_dir=${SCRIPT_DIR}/${output_dir}/${run_id}/${benchmark_id}

echo "Creating output directory: ${benchmark_output_dir}/results/json/"
mkdir -p "${benchmark_output_dir}/results/json/"

case "$PROVIDER" in
  gcs)
    echo "Downloading gs://${BUCKET}/${FOLDER_PATH} to ${benchmark_output_dir}/results/json/"
    gsutil cp -r "gs://${BUCKET}/${FOLDER_PATH}" "${benchmark_output_dir}/results/json/"
    ;;
  s3)
    echo "Downloading s3://${BUCKET}/${FOLDER_PATH} to ${benchmark_output_dir}/results/json/"
    # NOTE: `aws s3 cp` has no -r flag; --recursive is required to copy a prefix.
    aws s3 cp --recursive "s3://${BUCKET}/${FOLDER_PATH}" "${benchmark_output_dir}/results/json/"
    ;;
  *)
    echo "Invalid provider: $PROVIDER. Please use 'gcs' or 's3'."
    exit 1
    ;;
esac

echo "Download complete."

benchmarking/inference-perf/templates/job.yaml

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,18 @@ spec:
2121
{{- end }}
2222
{{- if .Values.gcsPath}}
2323
initContainers:
24-
- name: fetch-dataset
24+
- name: fetch-gcs-dataset
2525
image: google/cloud-sdk:latest
26-
command: ["sh", "-c", "gsutil cp {{ .Values.gcsPath }} /dataset/dataset.json"]
26+
command: ["sh", "-c", "gsutil cp {{ .Values.gcsPath }} /dataset/gcs-dataset.json"]
27+
volumeMounts:
28+
- name: dataset-volume
29+
mountPath: /dataset
30+
{{- end }}
31+
{{- if .Values.s3Path}}
32+
initContainers:
33+
- name: fetch-s3-dataset
34+
image: google/cloud-sdk:latest
35+
command: ["sh", "-c", "aws s3 cp s3://{{ .Values.s3Path }} /dataset/s3-dataset.json"]
2736
volumeMounts:
2837
- name: dataset-volume
2938
mountPath: /dataset
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# High-Cache Configuration
# Shared-prefix dataset with a long common system prompt (2048 tokens) and
# short questions (256 tokens), so most of each request can be served from
# the prefix cache.
job:
  image:
    repository: quay.io/inference-perf/inference-perf
    tag: "0.2.0" # Defaults to .Chart.AppVersion
  serviceAccountName: ""
  nodeSelector: {}
  # Example resources:
  # resources:
  #   requests:
  #     cpu: "1"
  #     memory: "4Gi"
  #   limits:
  #     cpu: "2"
  #     memory: "8Gi"
  resources: {}

logLevel: INFO

# A GCS bucket path that points to the dataset file.
# The file will be copied from this path to the local file system
# at /dataset/gcs-dataset.json for use during the run.
# NOTE: For this dataset to be used, config.data.path must also be explicitly set to /dataset/gcs-dataset.json.
gcsPath: ""

# An S3 bucket path that points to the dataset file.
# The file will be copied from this path to the local file system
# at /dataset/s3-dataset.json for use during the run.
# NOTE: For this dataset to be used, config.data.path must also be explicitly set to /dataset/s3-dataset.json.
s3Path: ""

# hfToken optionally creates a secret with the specified token.
# Can be set using helm install --set hftoken=<token>
hfToken: ""

config:
  load:
    type: constant
    interval: 15
    stages:
      - rate: 100
        duration: 30
      - rate: 200
        duration: 30
      - rate: 300
        duration: 30
      - rate: 400
        duration: 30
      - rate: 500
        duration: 30
      - rate: 600
        duration: 30
      - rate: 700
        duration: 30
      - rate: 800
        duration: 30
    worker_max_concurrency: 1000
  api:
    type: completion
    streaming: true
  server:
    type: vllm
    model_name: meta-llama/Llama-3.1-8B-Instruct
    base_url: http://0.0.0.0:8000
    ignore_eos: true
  tokenizer:
    pretrained_model_name_or_path: meta-llama/Llama-3.1-8B-Instruct
  data:
    type: shared_prefix
    shared_prefix:
      num_groups: 256
      num_prompts_per_group: 16
      system_prompt_len: 2048 # High-cache setting: long shared prefix
      question_len: 256 # High-cache setting: short unique suffix
      output_len: 256
  metrics:
    type: prometheus
    prometheus:
      google_managed: true
  report:
    request_lifecycle:
      summary: true
      per_stage: true
      per_request: true
    prometheus:
      summary: true
      per_stage: true
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# Low-Cache Configuration
# Shared-prefix dataset with a short common system prompt (256 tokens) and
# long unique questions (2048 tokens), so little of each request can be
# served from the prefix cache.
job:
  image:
    repository: quay.io/inference-perf/inference-perf
    tag: "0.2.0" # Defaults to .Chart.AppVersion
  serviceAccountName: ""
  nodeSelector: {}
  # Example resources:
  # resources:
  #   requests:
  #     cpu: "1"
  #     memory: "4Gi"
  #   limits:
  #     cpu: "2"
  #     memory: "8Gi"
  resources: {}

logLevel: INFO

# A GCS bucket path that points to the dataset file.
# The file will be copied from this path to the local file system
# at /dataset/gcs-dataset.json for use during the run.
# NOTE: For this dataset to be used, config.data.path must also be explicitly set to /dataset/gcs-dataset.json.
gcsPath: ""

# An S3 bucket path that points to the dataset file.
# The file will be copied from this path to the local file system
# at /dataset/s3-dataset.json for use during the run.
# NOTE: For this dataset to be used, config.data.path must also be explicitly set to /dataset/s3-dataset.json.
s3Path: ""

# hfToken optionally creates a secret with the specified token.
# Can be set using helm install --set hftoken=<token>
hfToken: ""

config:
  load:
    type: constant
    interval: 15
    stages:
      - rate: 100
        duration: 30
      - rate: 200
        duration: 30
      - rate: 300
        duration: 30
      - rate: 400
        duration: 30
      - rate: 500
        duration: 30
      - rate: 600
        duration: 30
      - rate: 700
        duration: 30
      - rate: 800
        duration: 30
    worker_max_concurrency: 1000
  api:
    type: completion
    streaming: true
  server:
    type: vllm
    model_name: meta-llama/Llama-3.1-8B-Instruct
    base_url: http://0.0.0.0:8000
    ignore_eos: true
  tokenizer:
    pretrained_model_name_or_path: meta-llama/Llama-3.1-8B-Instruct
  data:
    type: shared_prefix
    shared_prefix:
      num_groups: 256
      num_prompts_per_group: 16
      system_prompt_len: 256 # Low-cache setting
      question_len: 2048 # Low-cache setting
      output_len: 256
  metrics:
    type: prometheus
    prometheus:
      google_managed: true
  report:
    request_lifecycle:
      summary: true
      per_stage: true
      per_request: true
    prometheus:
      summary: true
      per_stage: true

mkdocs.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ nav:
8282
- Conformance Tests: guides/conformance-tests.md
8383
- Performance:
8484
- Benchmark: performance/benchmark/index.md
85+
- Advanced Benchmarking Configs:
86+
- Prefix Cache Aware: performance/benchmark/advanced-configs/prefix-cache-aware.md
8587
- Regression Testing: performance/regression-testing/index.md
8688
- Reference:
8789
- v1 API Reference: reference/spec.md

0 commit comments

Comments
 (0)