diff --git a/.github/actions/build-image/action.yml b/.github/actions/build-image/action.yml
new file mode 100644
index 000000000000..6151128ea564
--- /dev/null
+++ b/.github/actions/build-image/action.yml
@@ -0,0 +1,76 @@
+name: 'Build Docker Image'
+description: 'Build and push Docker images for different frameworks and targets'
+
+inputs:
+  framework:
+    description: 'Framework name (vllm, sglang)'
+    required: true
+  target:
+    description: 'Docker build target'
+    required: true
+  base-image:
+    description: 'Base Docker image'
+    required: true
+  framework-version:
+    description: 'Framework version'
+    required: true
+  container-type:
+    description: 'Container type (e.g., general)'
+    required: true
+  aws-account-id:
+    description: 'AWS account ID for ECR'
+    required: true
+  aws-region:
+    description: 'AWS region'
+    required: true
+  tag:
+    description: 'Complete image tag (e.g., vllm-0.12.0-gpu-py312-cu129-ubuntu22.04-ec2-pr-123)'
+    required: true
+
+outputs:
+  image-uri:
+    description: 'Built image URI'
+    value: ${{ steps.image-uri-build.outputs.CI_IMAGE_URI }}
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Setup buildkitd
+      shell: bash
+      run: .github/scripts/buildkitd.sh
+
+    - name: ECR login
+      uses: ./.github/actions/ecr-authenticate
+      with:
+        aws-account-id: ${{ inputs.aws-account-id }}
+        aws-region: ${{ inputs.aws-region }}
+
+    - name: Resolve image URI for build
+      id: image-uri-build
+      shell: bash
+      run: |
+        CI_IMAGE_URI=${{ inputs.aws-account-id }}.dkr.ecr.${{ inputs.aws-region }}.amazonaws.com/ci:${{ inputs.tag }}
+        echo "Image URI to build: ${CI_IMAGE_URI}"
+        echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_ENV}
+        echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_OUTPUT}
+
+    - name: Build image
+      shell: bash
+      run: |
+        docker buildx build --progress plain \
+          --build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \
+          --build-arg BASE_IMAGE="${{ inputs.base-image }}" \
+          --build-arg CONTAINER_TYPE="${{ inputs.container-type }}" \
+          --build-arg FRAMEWORK="${{ inputs.framework }}" \
+          --build-arg FRAMEWORK_VERSION="${{ inputs.framework-version }}" \
+          --cache-to=type=inline \
+          --cache-from=type=registry,ref=${CI_IMAGE_URI} \
+          --tag ${CI_IMAGE_URI} \
+          --target ${{ inputs.target }} \
+          -f docker/${{ inputs.framework }}/Dockerfile .
+
+    - name: Container push
+      shell: bash
+      run: |
+        docker push ${CI_IMAGE_URI}
+        docker rmi ${CI_IMAGE_URI}
\ No newline at end of file
diff --git a/.github/workflows/pr-sglang.yml b/.github/workflows/pr-sglang.yml
index acb20e8e06b6..14ee7a5142a4 100644
--- a/.github/workflows/pr-sglang.yml
+++ b/.github/workflows/pr-sglang.yml
@@ -93,44 +93,22 @@ jobs:
       group: ${{ github.workflow }}-build-sglang-image-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     outputs:
-      ci-image: ${{ steps.image-uri-build.outputs.CI_IMAGE_URI }}
+      ci-image: ${{ steps.build.outputs.image-uri }}
     steps:
       - uses: actions/checkout@v5
-      - run: .github/scripts/buildkitd.sh
-      - name: ECR login
-        uses: ./.github/actions/ecr-authenticate
+      - name: Build image
+        id: build
+        uses: ./.github/actions/build-image
         with:
+          framework: ${{ env.FRAMEWORK }}
+          target: sglang-sagemaker
+          base-image: lmsysorg/sglang:v${{ env.SGLANG_VERSION }}-${{ env.CUDA_VERSION }}-amd64
+          framework-version: ${{ env.SGLANG_VERSION }}
+          container-type: ${{ env.CONTAINER_TYPE }}
           aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
           aws-region: ${{ vars.AWS_REGION }}
-
-      - name: Resolve image URI for build
-        id: image-uri-build
-        run: |
-          CI_IMAGE_URI=${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:sglang-${{ env.SGLANG_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-sagemaker-pr-${{ github.event.pull_request.number }}
-          echo "Image URI to build: ${CI_IMAGE_URI}"
-          echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_ENV}
-          echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_OUTPUT}
-
-      - name: Build image
-        run: |
-          # base image: https://hub.docker.com/r/lmsysorg/sglang/tags
-          docker buildx build --progress plain \
-            --build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \
-            --build-arg BASE_IMAGE="lmsysorg/sglang:v${{ env.SGLANG_VERSION }}-${{ env.CUDA_VERSION }}-amd64" \
-            --build-arg CONTAINER_TYPE="${{ env.CONTAINER_TYPE }}" \
-            --build-arg FRAMEWORK="${{ env.FRAMEWORK }}" \
-            --build-arg FRAMEWORK_VERSION="${{ env.SGLANG_VERSION }}" \
-            --cache-to=type=inline \
-            --cache-from=type=registry,ref=${CI_IMAGE_URI} \
-            --tag ${CI_IMAGE_URI} \
-            --target sglang-sagemaker \
-            -f docker/sglang/Dockerfile .
-
-      - name: Container push
-        run: |
-          docker push ${CI_IMAGE_URI}
-          docker rmi ${CI_IMAGE_URI}
+          tag: ${{ env.FRAMEWORK }}-${{ env.SGLANG_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-sagemaker-pr-${{ github.event.pull_request.number }}
 
   set-test-environment:
     needs: [check-changes, build-sglang-image]
@@ -164,7 +142,7 @@ jobs:
           echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}
 
   sglang-local-benchmark-test:
-    needs: [set-test-environment, build-sglang-image]
+    needs: [set-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -222,7 +200,7 @@ jobs:
           --dataset-path /dataset/ShareGPT_V3_unfiltered_cleaned_split.json
 
   sglang-frontend-test:
-    needs: [build-sglang-image, set-test-environment]
+    needs: [set-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
diff --git a/.github/workflows/pr-vllm.yml b/.github/workflows/pr-vllm.yml
index 09a1fcad97a7..f64131e6e274 100644
--- a/.github/workflows/pr-vllm.yml
+++ b/.github/workflows/pr-vllm.yml
@@ -12,20 +12,25 @@ permissions:
   pull-requests: read
 
 env:
-  # CI Image configuration
+  # Common
   CONTAINER_TYPE: "general"
   FRAMEWORK: "vllm"
-  VLLM_VERSION: 0.12.0
-  VLLM_RAYSERVE_VERSION: 0.10.2
   PYTHON_VERSION: "py312"
   CUDA_VERSION: "cu129"
   OS_VERSION: "ubuntu22.04"
-  # Prod Image configuration
+  FORCE_COLOR: "1"
+
+  # vLLM EC2
+  VLLM_VERSION: 0.12.0
   PROD_EC2_IMAGE: vllm:0.12-gpu-py312-ec2
-  PROD_RAYSERVE_IMAGE: vllm:0.10-gpu-py312-rayserve
+
+  # vLLM SageMaker
   PROD_SAGEMAKER_IMAGE: vllm:0.12-gpu-py312
-  # CI environment configuration
-  FORCE_COLOR: "1"
+
+  # vLLM RayServe
+  VLLM_RAYSERVE_VERSION: 0.10.2
+  PROD_RAYSERVE_IMAGE: vllm:0.10-gpu-py312-rayserve
+
 
 jobs:
   gatekeeper:
@@ -95,44 +100,22 @@ jobs:
       group: ${{ github.workflow }}-build-vllm-ec2-image-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     outputs:
-      ci-image: ${{ steps.image-uri-build.outputs.CI_IMAGE_URI }}
+      ci-image: ${{ steps.build.outputs.image-uri }}
     steps:
       - uses: actions/checkout@v5
-      - run: .github/scripts/buildkitd.sh
-      - name: ECR login
-        uses: ./.github/actions/ecr-authenticate
+      - name: Build image
+        id: build
+        uses: ./.github/actions/build-image
         with:
+          framework: ${{ env.FRAMEWORK }}
+          target: vllm-ec2
+          base-image: vllm/vllm-openai:v${{ env.VLLM_VERSION }}
+          framework-version: ${{ env.VLLM_VERSION }}
+          container-type: ${{ env.CONTAINER_TYPE }}
           aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
           aws-region: ${{ vars.AWS_REGION }}
-
-      - name: Resolve image URI for build
-        id: image-uri-build
-        run: |
-          CI_IMAGE_URI=${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:vllm-${{ env.VLLM_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-ec2-pr-${{ github.event.pull_request.number }}
-          echo "Image URI to build: ${CI_IMAGE_URI}"
-          echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_ENV}
-          echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_OUTPUT}
-
-      - name: Build image
-        run: |
-          # base image: https://hub.docker.com/r/vllm/vllm-openai/tags
-          docker buildx build --progress plain \
-            --build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \
-            --build-arg BASE_IMAGE="vllm/vllm-openai:v${{ env.VLLM_VERSION }}" \
-            --build-arg CONTAINER_TYPE="${{ env.CONTAINER_TYPE }}" \
-            --build-arg FRAMEWORK="${{ env.FRAMEWORK }}" \
-            --build-arg FRAMEWORK_VERSION="${{ env.VLLM_VERSION }}" \
-            --cache-to=type=inline \
-            --cache-from=type=registry,ref=${CI_IMAGE_URI} \
-            --tag ${CI_IMAGE_URI} \
-            --target vllm-ec2 \
-            -f docker/vllm/Dockerfile .
-
-      - name: Container push
-        run: |
-          docker push ${CI_IMAGE_URI}
-          docker rmi ${CI_IMAGE_URI}
+          tag: ${{ env.FRAMEWORK }}-${{ env.VLLM_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-ec2-pr-${{ github.event.pull_request.number }}
 
   set-ec2-test-environment:
     needs: [check-changes, build-vllm-ec2-image]
@@ -166,7 +149,7 @@ jobs:
           echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}
 
   vllm-ec2-regression-test:
-    needs: [build-vllm-ec2-image, set-ec2-test-environment]
+    needs: [set-ec2-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -205,34 +188,14 @@ jobs:
 
       - name: Setup for vLLM tests
        run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            # delete old test dependencies file and regen
-            rm vllm_source/requirements/test.txt
-            uv pip compile vllm_source/requirements/test.in -o vllm_source/requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
-            uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
-            uv pip install --system pytest pytest-asyncio
-            uv pip install --system -e vllm_source/tests/vllm_test_utils
-            uv pip install --system hf_transfer
-            cd vllm_source
-            mkdir src
-            mv vllm src/vllm
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_0_12_0_test_setup.sh
 
       - name: Run vLLM tests
         run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            nvidia-smi
-
-            # Regression Test # 7min
-            cd vllm_source/tests
-            uv pip install --system modelscope
-            pytest -v -s test_regression.py
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_regression_test.sh
 
   vllm-ec2-cuda-test:
-    needs: [build-vllm-ec2-image, set-ec2-test-environment]
+    needs: [set-ec2-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -271,33 +234,14 @@ jobs:
 
       - name: Setup for vLLM tests
        run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            # delete old test dependencies file and regen
-            rm vllm_source/requirements/test.txt
-            uv pip compile vllm_source/requirements/test.in -o vllm_source/requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
-            uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
-            uv pip install --system pytest pytest-asyncio
-            uv pip install --system -e vllm_source/tests/vllm_test_utils
-            uv pip install --system hf_transfer
-            cd vllm_source
-            mkdir src
-            mv vllm src/vllm
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_0_12_0_test_setup.sh
 
       - name: Run vLLM tests
         run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            nvidia-smi
-
-            # Platform Tests (CUDA) # 4min
-            cd vllm_source/tests
-            pytest -v -s cuda/test_cuda_context.py
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_cuda_test.sh
 
   vllm-ec2-example-test:
-    needs: [build-vllm-ec2-image, set-ec2-test-environment]
+    needs: [set-ec2-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -336,45 +280,11 @@ jobs:
 
      - name: Setup for vLLM tests
        run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            # delete old test dependencies file and regen
-            rm vllm_source/requirements/test.txt
-            uv pip compile vllm_source/requirements/test.in -o vllm_source/requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
-            uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
-            uv pip install --system pytest pytest-asyncio
-            uv pip install --system -e vllm_source/tests/vllm_test_utils
-            uv pip install --system hf_transfer
-            cd vllm_source
-            mkdir src
-            mv vllm src/vllm
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_0_12_0_test_setup.sh
 
       - name: Run vLLM tests
         run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            nvidia-smi
-            cd vllm_source/examples
-            pip install tensorizer # for tensorizer test
-            python3 offline_inference/basic/generate.py --model facebook/opt-125m
-            # python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-            python3 offline_inference/basic/chat.py
-            python3 offline_inference/prefix_caching.py
-            python3 offline_inference/llm_engine_example.py
-            python3 offline_inference/audio_language.py --seed 0
-            python3 offline_inference/vision_language.py --seed 0
-            python3 offline_inference/vision_language_pooling.py --seed 0
-            python3 offline_inference/vision_language_multi_image.py --seed 0
-            python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-            python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-            python3 offline_inference/basic/classify.py
-            python3 offline_inference/basic/embed.py
-            python3 offline_inference/basic/score.py
-            python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-            # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
-            python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_ec2_examples_test.sh
 
   # ===================================================
   # =============== vLLM RayServe jobs ================
   # ===================================================
@@ -390,44 +300,22 @@ jobs:
       group: ${{ github.workflow }}-build-vllm-rayserve-image-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     outputs:
-      ci-image: ${{ steps.image-uri-build.outputs.CI_IMAGE_URI }}
+      ci-image: ${{ steps.build.outputs.image-uri }}
     steps:
       - uses: actions/checkout@v5
-      - run: .github/scripts/buildkitd.sh
-      - name: ECR login
-        uses: ./.github/actions/ecr-authenticate
+      - name: Build image
+        id: build
+        uses: ./.github/actions/build-image
         with:
+          framework: ${{ env.FRAMEWORK }}
+          target: vllm-rayserve-ec2
+          base-image: vllm/vllm-openai:v${{ env.VLLM_RAYSERVE_VERSION }}
+          framework-version: ${{ env.VLLM_RAYSERVE_VERSION }}
+          container-type: ${{ env.CONTAINER_TYPE }}
           aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
           aws-region: ${{ vars.AWS_REGION }}
-
-      - name: Resolve image URI for build
-        id: image-uri-build
-        run: |
-          CI_IMAGE_URI=${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:vllm-${{ env.VLLM_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-rayserve-ec2-pr-${{ github.event.pull_request.number }}
-          echo "Image URI to build: ${CI_IMAGE_URI}"
-          echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_ENV}
-          echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_OUTPUT}
-
-      - name: Build image
-        run: |
-          # base image: https://hub.docker.com/r/vllm/vllm-openai/tags
-          docker buildx build --progress plain \
-            --build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \
-            --build-arg BASE_IMAGE="vllm/vllm-openai:v${{ env.VLLM_RAYSERVE_VERSION }}" \
-            --build-arg CONTAINER_TYPE="${{ env.CONTAINER_TYPE }}" \
-            --build-arg FRAMEWORK="${{ env.FRAMEWORK }}" \
-            --build-arg FRAMEWORK_VERSION="${{ env.VLLM_RAYSERVE_VERSION }}" \
-            --cache-to=type=inline \
-            --cache-from=type=registry,ref=${CI_IMAGE_URI} \
-            --tag ${CI_IMAGE_URI} \
-            --target vllm-rayserve-ec2 \
-            -f docker/vllm/Dockerfile .
-
-      - name: Container push
-        run: |
-          docker push ${CI_IMAGE_URI}
-          docker rmi ${CI_IMAGE_URI}
+          tag: ${{ env.FRAMEWORK }}-${{ env.VLLM_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-rayserve-ec2-pr-${{ github.event.pull_request.number }}
 
   set-rayserve-test-environment:
     needs: [check-changes, build-vllm-rayserve-image]
@@ -461,7 +349,7 @@ jobs:
           echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}
 
   vllm-rayserve-regression-test:
-    needs: [build-vllm-rayserve-image, set-rayserve-test-environment]
+    needs: [set-rayserve-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -500,31 +388,14 @@ jobs:
 
       - name: Setup for vLLM tests
        run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
-            uv pip install --system pytest pytest-asyncio
-            uv pip install --system -e vllm_source/tests/vllm_test_utils
-            uv pip install --system hf_transfer
-            cd vllm_source
-            mkdir src
-            mv vllm src/vllm
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_0_10_2_test_setup.sh
 
       - name: Run vLLM tests
         run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            nvidia-smi
-
-            # Regression Test # 7min
-            cd vllm_source/tests
-            uv pip install --system modelscope
-            pytest -v -s test_regression.py
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_regression_test.sh
 
   vllm-rayserve-cuda-test:
-    needs: [build-vllm-rayserve-image, set-rayserve-test-environment]
+    needs: [set-rayserve-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -563,30 +434,14 @@ jobs:
 
       - name: Setup for vLLM tests
        run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
-            uv pip install --system pytest pytest-asyncio
-            uv pip install --system -e vllm_source/tests/vllm_test_utils
-            uv pip install --system hf_transfer
-            cd vllm_source
-            mkdir src
-            mv vllm src/vllm
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_0_10_2_test_setup.sh
 
       - name: Run vLLM tests
         run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            nvidia-smi
-
-            # Platform Tests (CUDA) # 4min
-            cd vllm_source/tests
-            pytest -v -s cuda/test_cuda_context.py
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_cuda_test.sh
 
   vllm-rayserve-example-test:
-    needs: [build-vllm-rayserve-image, set-rayserve-test-environment]
+    needs: [set-rayserve-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -625,47 +480,11 @@ jobs:
 
      - name: Setup for vLLM tests
        run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
-            uv pip install --system pytest pytest-asyncio
-            uv pip install --system -e vllm_source/tests/vllm_test_utils
-            uv pip install --system hf_transfer
-            cd vllm_source
-            mkdir src
-            mv vllm src/vllm
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_0_10_2_test_setup.sh
 
       - name: Run vLLM tests
         run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            nvidia-smi
-
-            # Examples Test # 30min
-            cd vllm_source/examples
-            pip install tensorizer # for tensorizer test
-            python3 offline_inference/basic/generate.py --model facebook/opt-125m
-            # python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-            python3 offline_inference/basic/chat.py
-            python3 offline_inference/prefix_caching.py
-            python3 offline_inference/llm_engine_example.py
-
-            # NOTE: Change in Ultravox model changed the class of a audio_processor https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/commit/9a3c571b8fdaf1e66dd3ea61bbcb6db5c70a438e
-            # vLLM created a fix here https://github.com/vllm-project/vllm/pull/29588 but it is not consumed in vLLM<=0.11
-            # python3 offline_inference/audio_language.py --seed 0
-
-            python3 offline_inference/vision_language.py --seed 0
-            # broken before v0.12.0: https://github.com/vllm-project/vllm/commit/c64c0b78de4716ef019666663c56b6ceaa019463
-            # python3 offline_inference/vision_language_pooling.py --seed
-            # python3 offline_inference/vision_language_multi_image.py --seed 0
-            python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-            python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-            python3 offline_inference/basic/classify.py
-            python3 offline_inference/basic/embed.py
-            python3 offline_inference/basic/score.py
-            VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_rayserve_examples_test.sh
 
   # ====================================================
   # =============== vLLM SageMaker jobs ================
   # ====================================================
@@ -681,44 +500,22 @@ jobs:
       group: ${{ github.workflow }}-build-vllm-sagemaker-image-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     outputs:
-      ci-image: ${{ steps.image-uri-build.outputs.CI_IMAGE_URI }}
+      ci-image: ${{ steps.build.outputs.image-uri }}
     steps:
       - uses: actions/checkout@v5
-      - run: .github/scripts/buildkitd.sh
-      - name: ECR login
-        uses: ./.github/actions/ecr-authenticate
+      - name: Build image
+        id: build
+        uses: ./.github/actions/build-image
         with:
+          framework: ${{ env.FRAMEWORK }}
+          target: vllm-sagemaker
+          base-image: vllm/vllm-openai:v${{ env.VLLM_VERSION }}
+          framework-version: ${{ env.VLLM_VERSION }}
+          container-type: ${{ env.CONTAINER_TYPE }}
           aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
           aws-region: ${{ vars.AWS_REGION }}
-
-      - name: Resolve image URI for build
-        id: image-uri-build
-        run: |
-          CI_IMAGE_URI=${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:vllm-${{ env.VLLM_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-sagemaker-pr-${{ github.event.pull_request.number }}
-          echo "Image URI to build: ${CI_IMAGE_URI}"
-          echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_ENV}
-          echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_OUTPUT}
-
-      - name: Build image
-        run: |
-          # base image: https://hub.docker.com/r/vllm/vllm-openai/tags
-          docker buildx build --progress plain \
-            --build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \
-            --build-arg BASE_IMAGE="vllm/vllm-openai:v${{ env.VLLM_VERSION }}" \
-            --build-arg CONTAINER_TYPE="${{ env.CONTAINER_TYPE }}" \
-            --build-arg FRAMEWORK="${{ env.FRAMEWORK }}" \
-            --build-arg FRAMEWORK_VERSION="${{ env.VLLM_VERSION }}" \
-            --cache-to=type=inline \
-            --cache-from=type=registry,ref=${CI_IMAGE_URI} \
-            --tag ${CI_IMAGE_URI} \
-            --target vllm-sagemaker \
-            -f docker/vllm/Dockerfile .
-
-      - name: Container push
-        run: |
-          docker push ${CI_IMAGE_URI}
-          docker rmi ${CI_IMAGE_URI}
+          tag: ${{ env.FRAMEWORK }}-${{ env.VLLM_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-sagemaker-pr-${{ github.event.pull_request.number }}
 
   set-sagemaker-test-environment:
     needs: [check-changes, build-vllm-sagemaker-image]
@@ -752,7 +549,7 @@ jobs:
           echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}
 
   vllm-sagemaker-regression-test:
-    needs: [build-vllm-sagemaker-image, set-sagemaker-test-environment]
+    needs: [set-sagemaker-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -791,34 +588,14 @@ jobs:
 
      - name: Setup for vLLM tests
        run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            # delete old test dependencies file and regen
-            rm vllm_source/requirements/test.txt
-            uv pip compile vllm_source/requirements/test.in -o vllm_source/requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
-            uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
-            uv pip install --system pytest pytest-asyncio
-            uv pip install --system -e vllm_source/tests/vllm_test_utils
-            uv pip install --system hf_transfer
-            cd vllm_source
-            mkdir src
-            mv vllm src/vllm
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_0_12_0_test_setup.sh
 
       - name: Run vLLM tests
         run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            nvidia-smi
-
-            # Regression Test # 7min
-            cd vllm_source/tests
-            uv pip install --system modelscope
-            pytest -v -s test_regression.py
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_regression_test.sh
 
   vllm-sagemaker-cuda-test:
-    needs: [build-vllm-sagemaker-image, set-sagemaker-test-environment]
+    needs: [set-sagemaker-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -857,33 +634,14 @@ jobs:
 
      - name: Setup for vLLM tests
        run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            # delete old test dependencies file and regen
-            rm vllm_source/requirements/test.txt
-            uv pip compile vllm_source/requirements/test.in -o vllm_source/requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
-            uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
-            uv pip install --system pytest pytest-asyncio
-            uv pip install --system -e vllm_source/tests/vllm_test_utils
-            uv pip install --system hf_transfer
-            cd vllm_source
-            mkdir src
-            mv vllm src/vllm
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_0_12_0_test_setup.sh
 
       - name: Run vLLM tests
         run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            nvidia-smi
-
-            # Platform Tests (CUDA) # 4min
-            cd vllm_source/tests
-            pytest -v -s cuda/test_cuda_context.py
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_cuda_test.sh
 
   vllm-sagemaker-example-test:
-    needs: [build-vllm-sagemaker-image, set-sagemaker-test-environment]
+    needs: [set-sagemaker-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -922,63 +680,11 @@ jobs:
 
      - name: Setup for vLLM tests
        run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            # delete old test dependencies file and regen
-            rm vllm_source/requirements/test.txt
-            uv pip compile vllm_source/requirements/test.in -o vllm_source/requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
-            uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
-            uv pip install --system pytest pytest-asyncio
-            uv pip install --system -e vllm_source/tests/vllm_test_utils
-            uv pip install --system hf_transfer
-            cd vllm_source
-            mkdir src
-            mv vllm src/vllm
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_0_12_0_test_setup.sh
 
       - name: Run vLLM tests
         run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            nvidia-smi
-
-            cd vllm_source
-
-            # Test LoRA adapter loading/unloading via SageMaker endpoints
-            pytest tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py -v
-
-            # Test stateful session management
-            pytest tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py -v
-
-            # Test sagemaker custom middleware
-            pytest tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py -v
-
-            # Test sagemaker endpoint overrides
-            pytest tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py -v
-
-            # Test LoRA adapter loading/unloading via original OpenAI API server endpoints
-            pytest tests/entrypoints/openai/test_lora_adapters.py -v
-
-            cd examples
-            pip install tensorizer # for tensorizer test
-            python3 offline_inference/basic/generate.py --model facebook/opt-125m
-            # python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-            python3 offline_inference/basic/chat.py
-            python3 offline_inference/prefix_caching.py
-            python3 offline_inference/llm_engine_example.py
-            python3 offline_inference/audio_language.py --seed 0
-            python3 offline_inference/vision_language.py --seed 0
-            python3 offline_inference/vision_language_pooling.py --seed 0
-            python3 offline_inference/vision_language_multi_image.py --seed 0
-            python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-            python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-            python3 offline_inference/basic/classify.py
-            python3 offline_inference/basic/embed.py
-            python3 offline_inference/basic/score.py
-            python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-            # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
-            python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_sagemaker_examples_test.sh
 
   vllm-sagemaker-endpoint-test:
     needs: [set-sagemaker-test-environment]
diff --git a/scripts/vllm/vllm_0_10_2_test_setup.sh b/scripts/vllm/vllm_0_10_2_test_setup.sh
new file mode 100755
index 000000000000..439f8033ddf9
--- /dev/null
+++ b/scripts/vllm/vllm_0_10_2_test_setup.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+set -eux
+
+uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
+uv pip install --system pytest pytest-asyncio
+uv pip install --system -e vllm_source/tests/vllm_test_utils
+uv pip install --system hf_transfer
+cd vllm_source
+mkdir src
+mv vllm src/vllm
\ No newline at end of file
diff --git a/scripts/vllm/vllm_0_12_0_test_setup.sh b/scripts/vllm/vllm_0_12_0_test_setup.sh
new file mode 100755
index 000000000000..9a4a09b3fae4
--- /dev/null
+++ b/scripts/vllm/vllm_0_12_0_test_setup.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+set -eux
+
+# delete old test dependencies file and regen
+rm vllm_source/requirements/test.txt
+uv pip compile vllm_source/requirements/test.in -o vllm_source/requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
+uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
+uv pip install --system pytest pytest-asyncio
+uv pip install --system -e vllm_source/tests/vllm_test_utils
+uv pip install --system hf_transfer
+cd vllm_source
+mkdir src
+mv vllm src/vllm
\ No newline at end of file
diff --git a/scripts/vllm/vllm_cuda_test.sh b/scripts/vllm/vllm_cuda_test.sh
new file mode 100755
index 000000000000..ed0b2c0082df
--- /dev/null
+++ b/scripts/vllm/vllm_cuda_test.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+set -eux
+
+nvidia-smi
+# Platform Tests (CUDA) # 4min
+cd vllm_source/tests
+pytest -v -s cuda/test_cuda_context.py
\ No newline at end of file
diff --git a/scripts/vllm/vllm_ec2_examples_test.sh b/scripts/vllm/vllm_ec2_examples_test.sh
new file mode 100755
index 000000000000..ef7f3f0a8c8f
--- /dev/null
+++ b/scripts/vllm/vllm_ec2_examples_test.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+set -eux
+
+nvidia-smi
+cd vllm_source/examples
+pip install tensorizer # for tensorizer test
+python3 offline_inference/basic/generate.py --model facebook/opt-125m
+# python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+python3 offline_inference/basic/chat.py
+python3 offline_inference/prefix_caching.py
+python3 offline_inference/llm_engine_example.py
+python3 offline_inference/audio_language.py --seed 0
+python3 offline_inference/vision_language.py --seed 0
+python3 offline_inference/vision_language_pooling.py --seed 0
+python3 offline_inference/vision_language_multi_image.py --seed 0
+python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+python3 offline_inference/basic/classify.py
+python3 offline_inference/basic/embed.py
+python3 offline_inference/basic/score.py
+python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
\ No newline at end of file
diff --git a/scripts/vllm/vllm_rayserve_examples_test.sh b/scripts/vllm/vllm_rayserve_examples_test.sh
new file mode 100755
index 000000000000..39de96c4e420
--- /dev/null
+++ b/scripts/vllm/vllm_rayserve_examples_test.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+set -eux
+
+nvidia-smi
+
+# Examples Test # 30min
+cd vllm_source/examples
+pip install tensorizer # for tensorizer test
+python3 offline_inference/basic/generate.py --model facebook/opt-125m
+# python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+python3 offline_inference/basic/chat.py
+python3 offline_inference/prefix_caching.py
+python3 offline_inference/llm_engine_example.py
+
+# NOTE: Change in Ultravox model changed the class of a audio_processor https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/commit/9a3c571b8fdaf1e66dd3ea61bbcb6db5c70a438e
+# vLLM created a fix here https://github.com/vllm-project/vllm/pull/29588 but it is not consumed in vLLM<=0.11
+# python3 offline_inference/audio_language.py --seed 0
+
+python3 offline_inference/vision_language.py --seed 0
+# broken before v0.12.0: https://github.com/vllm-project/vllm/commit/c64c0b78de4716ef019666663c56b6ceaa019463
+# python3 offline_inference/vision_language_pooling.py --seed
+# python3 offline_inference/vision_language_multi_image.py --seed 0
+python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+python3 offline_inference/basic/classify.py
+python3 offline_inference/basic/embed.py
+python3 offline_inference/basic/score.py
+VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
\ No newline at end of file
diff --git a/scripts/vllm/vllm_regression_test.sh b/scripts/vllm/vllm_regression_test.sh
new file mode 100755
index 000000000000..8de728dbd9d0
--- /dev/null
+++ b/scripts/vllm/vllm_regression_test.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -eux
+nvidia-smi
+
+# Regression Test # 7min
+cd vllm_source/tests
+uv pip install --system modelscope
+pytest -v -s test_regression.py
\ No newline at end of file
diff --git a/scripts/vllm/vllm_sagemaker_examples_test.sh b/scripts/vllm/vllm_sagemaker_examples_test.sh
new file mode 100755
index 000000000000..bed45806d05a
--- /dev/null
+++ b/scripts/vllm/vllm_sagemaker_examples_test.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+set -eux
+nvidia-smi
+
+cd vllm_source
+
+# Test LoRA adapter loading/unloading via SageMaker endpoints
+pytest tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py -v
+
+# Test stateful session management
+pytest tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py -v
+
+# Test sagemaker custom middleware
+pytest tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py -v
+
+# Test sagemaker endpoint overrides
+pytest tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py -v
+
+# Test LoRA adapter loading/unloading via original OpenAI API server endpoints
+pytest tests/entrypoints/openai/test_lora_adapters.py -v
+
+cd examples
+pip install tensorizer # for tensorizer test
+python3 offline_inference/basic/generate.py --model facebook/opt-125m
+# python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+python3 offline_inference/basic/chat.py
+python3 offline_inference/prefix_caching.py
+python3 offline_inference/llm_engine_example.py
+python3 offline_inference/audio_language.py --seed 0
+python3 offline_inference/vision_language.py --seed 0
+python3 offline_inference/vision_language_pooling.py --seed 0
+python3 offline_inference/vision_language_multi_image.py --seed 0
+python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+python3 offline_inference/basic/classify.py
+python3 offline_inference/basic/embed.py
+python3 offline_inference/basic/score.py
+python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
\ No newline at end of file
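For reference, the `ci-image` output wired up in the build jobs above is how the test-environment jobs pick up the freshly pushed CI image. A minimal sketch of that consumption, assuming the existing shape of the set-*-test-environment steps (the concrete steps are outside this diff, and the step name here is illustrative):

  set-ec2-test-environment:
    needs: [check-changes, build-vllm-ec2-image]
    steps:
      - name: Resolve image URI for test
        id: image-uri-test
        run: |
          # ci-image is populated from steps.build.outputs.image-uri in build-vllm-ec2-image
          IMAGE_URI=${{ needs.build-vllm-ec2-image.outputs.ci-image }}
          echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}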