diff --git a/.github/actions/build-image/action.yml b/.github/actions/build-image/action.yml
new file mode 100644
index 000000000000..6151128ea564
--- /dev/null
+++ b/.github/actions/build-image/action.yml
@@ -0,0 +1,76 @@
+name: 'Build Docker Image'
+description: 'Build and push Docker images for different frameworks and targets'
+
+inputs:
+  framework:
+    description: 'Framework name (vllm, sglang)'
+    required: true
+  target:
+    description: 'Docker build target'
+    required: true
+  base-image:
+    description: 'Base Docker image'
+    required: true
+  framework-version:
+    description: 'Framework version'
+    required: true
+  container-type:
+    description: 'Container type (e.g., general)'
+    required: true
+  aws-account-id:
+    description: 'AWS account ID for ECR'
+    required: true
+  aws-region:
+    description: 'AWS region'
+    required: true
+  tag:
+    description: 'Complete image tag (e.g., vllm-0.12.0-gpu-py312-cu129-ubuntu22.04-ec2-pr-123)'
+    required: true
+
+outputs:
+  image-uri:
+    description: 'Built image URI'
+    value: ${{ steps.image-uri-build.outputs.CI_IMAGE_URI }}
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Setup buildkitd
+      shell: bash
+      run: .github/scripts/buildkitd.sh
+
+    - name: ECR login
+      uses: ./.github/actions/ecr-authenticate
+      with:
+        aws-account-id: ${{ inputs.aws-account-id }}
+        aws-region: ${{ inputs.aws-region }}
+
+    - name: Resolve image URI for build
+      id: image-uri-build
+      shell: bash
+      run: |
+        CI_IMAGE_URI=${{ inputs.aws-account-id }}.dkr.ecr.${{ inputs.aws-region }}.amazonaws.com/ci:${{ inputs.tag }}
+        echo "Image URI to build: ${CI_IMAGE_URI}"
+        echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_ENV}
+        echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_OUTPUT}
+
+    - name: Build image
+      shell: bash
+      run: |
+        docker buildx build --progress plain \
+          --build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \
+          --build-arg BASE_IMAGE="${{ inputs.base-image }}" \
+          --build-arg CONTAINER_TYPE="${{ inputs.container-type }}" \
+          --build-arg FRAMEWORK="${{ inputs.framework }}" \
+          --build-arg FRAMEWORK_VERSION="${{ inputs.framework-version }}" \
+          --cache-to=type=inline \
+          --cache-from=type=registry,ref=${CI_IMAGE_URI} \
+          --tag ${CI_IMAGE_URI} \
+          --target ${{ inputs.target }} \
+          -f docker/${{ inputs.framework }}/Dockerfile .
+
+    - name: Container push
+      shell: bash
+      run: |
+        docker push ${CI_IMAGE_URI}
+        docker rmi ${CI_IMAGE_URI}
\ No newline at end of file
diff --git a/.github/workflows/pr-sglang.yml b/.github/workflows/pr-sglang.yml
index acb20e8e06b6..14ee7a5142a4 100644
--- a/.github/workflows/pr-sglang.yml
+++ b/.github/workflows/pr-sglang.yml
@@ -93,44 +93,22 @@ jobs:
       group: ${{ github.workflow }}-build-sglang-image-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     outputs:
-      ci-image: ${{ steps.image-uri-build.outputs.CI_IMAGE_URI }}
+      ci-image: ${{ steps.build.outputs.image-uri }}
     steps:
       - uses: actions/checkout@v5
-      - run: .github/scripts/buildkitd.sh
-      - name: ECR login
-        uses: ./.github/actions/ecr-authenticate
+      - name: Build image
+        id: build
+        uses: ./.github/actions/build-image
         with:
+          framework: ${{ env.FRAMEWORK }}
+          target: sglang-sagemaker
+          base-image: lmsysorg/sglang:v${{ env.SGLANG_VERSION }}-${{ env.CUDA_VERSION }}-amd64
+          framework-version: ${{ env.SGLANG_VERSION }}
+          container-type: ${{ env.CONTAINER_TYPE }}
           aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
           aws-region: ${{ vars.AWS_REGION }}
-
-      - name: Resolve image URI for build
-        id: image-uri-build
-        run: |
-          CI_IMAGE_URI=${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:sglang-${{ env.SGLANG_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-sagemaker-pr-${{ github.event.pull_request.number }}
-          echo "Image URI to build: ${CI_IMAGE_URI}"
-          echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_ENV}
-          echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_OUTPUT}
-
-      - name: Build image
-        run: |
-          # base image: https://hub.docker.com/r/lmsysorg/sglang/tags
-          docker buildx build --progress plain \
-            --build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \
-            --build-arg BASE_IMAGE="lmsysorg/sglang:v${{ env.SGLANG_VERSION }}-${{ env.CUDA_VERSION }}-amd64" \
-            --build-arg CONTAINER_TYPE="${{ env.CONTAINER_TYPE }}" \
-            --build-arg FRAMEWORK="${{ env.FRAMEWORK }}" \
-            --build-arg FRAMEWORK_VERSION="${{ env.SGLANG_VERSION }}" \
-            --cache-to=type=inline \
-            --cache-from=type=registry,ref=${CI_IMAGE_URI} \
-            --tag ${CI_IMAGE_URI} \
-            --target sglang-sagemaker \
-            -f docker/sglang/Dockerfile .
-
-      - name: Container push
-        run: |
-          docker push ${CI_IMAGE_URI}
-          docker rmi ${CI_IMAGE_URI}
+          tag: ${{ env.FRAMEWORK }}-${{ env.SGLANG_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-sagemaker-pr-${{ github.event.pull_request.number }}
 
   set-test-environment:
     needs: [check-changes, build-sglang-image]
@@ -164,7 +142,7 @@ jobs:
           echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}
 
   sglang-local-benchmark-test:
-    needs: [set-test-environment, build-sglang-image]
+    needs: [set-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -222,7 +200,7 @@ jobs:
           --dataset-path /dataset/ShareGPT_V3_unfiltered_cleaned_split.json
 
   sglang-frontend-test:
-    needs: [build-sglang-image, set-test-environment]
+    needs: [set-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
diff --git a/.github/workflows/pr-vllm.yml b/.github/workflows/pr-vllm.yml
index 09a1fcad97a7..f64131e6e274 100644
--- a/.github/workflows/pr-vllm.yml
+++ b/.github/workflows/pr-vllm.yml
@@ -12,20 +12,25 @@ permissions:
   pull-requests: read
 
 env:
-  # CI Image configuration
+  # Common
   CONTAINER_TYPE: "general"
   FRAMEWORK: "vllm"
-  VLLM_VERSION: 0.12.0
-  VLLM_RAYSERVE_VERSION: 0.10.2
   PYTHON_VERSION: "py312"
   CUDA_VERSION: "cu129"
   OS_VERSION: "ubuntu22.04"
-  # Prod Image configuration
+  FORCE_COLOR: "1"
+
+  # vLLM EC2
+  VLLM_VERSION: 0.12.0
   PROD_EC2_IMAGE: vllm:0.12-gpu-py312-ec2
-  PROD_RAYSERVE_IMAGE: vllm:0.10-gpu-py312-rayserve
+
+  # vLLM SageMaker
   PROD_SAGEMAKER_IMAGE: vllm:0.12-gpu-py312
-  # CI environment configuration
-  FORCE_COLOR: "1"
+
+  # vLLM RayServe
+  VLLM_RAYSERVE_VERSION: 0.10.2
+  PROD_RAYSERVE_IMAGE: vllm:0.10-gpu-py312-rayserve
+
 
 jobs:
   gatekeeper:
@@ -95,44 +100,22 @@ jobs:
       group: ${{ github.workflow }}-build-vllm-ec2-image-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     outputs:
-      ci-image: ${{ steps.image-uri-build.outputs.CI_IMAGE_URI }}
+      ci-image: ${{ steps.build.outputs.image-uri }}
     steps:
       - uses: actions/checkout@v5
-      - run: .github/scripts/buildkitd.sh
-      - name: ECR login
-        uses: ./.github/actions/ecr-authenticate
+      - name: Build image
+        id: build
+        uses: ./.github/actions/build-image
         with:
+          framework: ${{ env.FRAMEWORK }}
+          target: vllm-ec2
+          base-image: vllm/vllm-openai:v${{ env.VLLM_VERSION }}
+          framework-version: ${{ env.VLLM_VERSION }}
+          container-type: ${{ env.CONTAINER_TYPE }}
           aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
           aws-region: ${{ vars.AWS_REGION }}
-
-      - name: Resolve image URI for build
-        id: image-uri-build
-        run: |
-          CI_IMAGE_URI=${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:vllm-${{ env.VLLM_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-ec2-pr-${{ github.event.pull_request.number }}
-          echo "Image URI to build: ${CI_IMAGE_URI}"
-          echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_ENV}
-          echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_OUTPUT}
-
-      - name: Build image
-        run: |
-          # base image: https://hub.docker.com/r/vllm/vllm-openai/tags
-          docker buildx build --progress plain \
-            --build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \
-            --build-arg BASE_IMAGE="vllm/vllm-openai:v${{ env.VLLM_VERSION }}" \
-            --build-arg CONTAINER_TYPE="${{ env.CONTAINER_TYPE }}" \
-            --build-arg FRAMEWORK="${{ env.FRAMEWORK }}" \
-            --build-arg FRAMEWORK_VERSION="${{ env.VLLM_VERSION }}" \
-            --cache-to=type=inline \
-            --cache-from=type=registry,ref=${CI_IMAGE_URI} \
-            --tag ${CI_IMAGE_URI} \
-            --target vllm-ec2 \
-            -f docker/vllm/Dockerfile .
-
-      - name: Container push
-        run: |
-          docker push ${CI_IMAGE_URI}
-          docker rmi ${CI_IMAGE_URI}
+          tag: ${{ env.FRAMEWORK }}-${{ env.VLLM_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-ec2-pr-${{ github.event.pull_request.number }}
 
   set-ec2-test-environment:
     needs: [check-changes, build-vllm-ec2-image]
@@ -166,7 +149,7 @@ jobs:
           echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}
 
   vllm-ec2-regression-test:
-    needs: [build-vllm-ec2-image, set-ec2-test-environment]
+    needs: [set-ec2-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -205,34 +188,14 @@ jobs:
 
       - name: Setup for vLLM tests
        run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            # delete old test dependencies file and regen
-            rm vllm_source/requirements/test.txt
-            uv pip compile vllm_source/requirements/test.in -o vllm_source/requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
-            uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
-            uv pip install --system pytest pytest-asyncio
-            uv pip install --system -e vllm_source/tests/vllm_test_utils
-            uv pip install --system hf_transfer
-            cd vllm_source
-            mkdir src
-            mv vllm src/vllm
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_0_12_0_test_setup.sh
 
       - name: Run vLLM tests
         run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            nvidia-smi
-
-            # Regression Test # 7min
-            cd vllm_source/tests
-            uv pip install --system modelscope
-            pytest -v -s test_regression.py
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_regression_test.sh
 
   vllm-ec2-cuda-test:
-    needs: [build-vllm-ec2-image, set-ec2-test-environment]
+    needs: [set-ec2-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -271,33 +234,14 @@ jobs:
 
       - name: Setup for vLLM tests
        run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            # delete old test dependencies file and regen
-            rm vllm_source/requirements/test.txt
-            uv pip compile vllm_source/requirements/test.in -o vllm_source/requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
-            uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
-            uv pip install --system pytest pytest-asyncio
-            uv pip install --system -e vllm_source/tests/vllm_test_utils
-            uv pip install --system hf_transfer
-            cd vllm_source
-            mkdir src
-            mv vllm src/vllm
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_0_12_0_test_setup.sh
 
       - name: Run vLLM tests
         run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            nvidia-smi
-
-            # Platform Tests (CUDA) # 4min
-            cd vllm_source/tests
-            pytest -v -s cuda/test_cuda_context.py
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_cuda_test.sh
 
   vllm-ec2-example-test:
-    needs: [build-vllm-ec2-image, set-ec2-test-environment]
+    needs: [set-ec2-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -336,45 +280,11 @@ jobs:
 
      - name: Setup for vLLM tests
        run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            # delete old test dependencies file and regen
-            rm vllm_source/requirements/test.txt
-            uv pip compile vllm_source/requirements/test.in -o vllm_source/requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
-            uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
-            uv pip install --system pytest pytest-asyncio
-            uv pip install --system -e vllm_source/tests/vllm_test_utils
-            uv pip install --system hf_transfer
-            cd vllm_source
-            mkdir src
-            mv vllm src/vllm
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_0_12_0_test_setup.sh
 
       - name: Run vLLM tests
         run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            nvidia-smi
-            cd vllm_source/examples
-            pip install tensorizer # for tensorizer test
-            python3 offline_inference/basic/generate.py --model facebook/opt-125m
-            # python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-            python3 offline_inference/basic/chat.py
-            python3 offline_inference/prefix_caching.py
-            python3 offline_inference/llm_engine_example.py
-            python3 offline_inference/audio_language.py --seed 0
-            python3 offline_inference/vision_language.py --seed 0
-            python3 offline_inference/vision_language_pooling.py --seed 0
-            python3 offline_inference/vision_language_multi_image.py --seed 0
-            python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-            python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-            python3 offline_inference/basic/classify.py
-            python3 offline_inference/basic/embed.py
-            python3 offline_inference/basic/score.py
-            python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-            # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
-            python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_ec2_examples_test.sh
 
   # ===================================================
   # =============== vLLM RayServe jobs ================
   # ===================================================
@@ -390,44 +300,22 @@ jobs:
       group: ${{ github.workflow }}-build-vllm-rayserve-image-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     outputs:
-      ci-image: ${{ steps.image-uri-build.outputs.CI_IMAGE_URI }}
+      ci-image: ${{ steps.build.outputs.image-uri }}
     steps:
       - uses: actions/checkout@v5
-      - run: .github/scripts/buildkitd.sh
-      - name: ECR login
-        uses: ./.github/actions/ecr-authenticate
+      - name: Build image
+        id: build
+        uses: ./.github/actions/build-image
         with:
+          framework: ${{ env.FRAMEWORK }}
+          target: vllm-rayserve-ec2
+          base-image: vllm/vllm-openai:v${{ env.VLLM_RAYSERVE_VERSION }}
+          framework-version: ${{ env.VLLM_RAYSERVE_VERSION }}
+          container-type: ${{ env.CONTAINER_TYPE }}
           aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
           aws-region: ${{ vars.AWS_REGION }}
-
-      - name: Resolve image URI for build
-        id: image-uri-build
-        run: |
-          CI_IMAGE_URI=${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:vllm-${{ env.VLLM_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-rayserve-ec2-pr-${{ github.event.pull_request.number }}
-          echo "Image URI to build: ${CI_IMAGE_URI}"
-          echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_ENV}
-          echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_OUTPUT}
-
-      - name: Build image
-        run: |
-          # base image: https://hub.docker.com/r/vllm/vllm-openai/tags
-          docker buildx build --progress plain \
-            --build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \
-            --build-arg BASE_IMAGE="vllm/vllm-openai:v${{ env.VLLM_RAYSERVE_VERSION }}" \
-            --build-arg CONTAINER_TYPE="${{ env.CONTAINER_TYPE }}" \
-            --build-arg FRAMEWORK="${{ env.FRAMEWORK }}" \
-            --build-arg FRAMEWORK_VERSION="${{ env.VLLM_RAYSERVE_VERSION }}" \
-            --cache-to=type=inline \
-            --cache-from=type=registry,ref=${CI_IMAGE_URI} \
-            --tag ${CI_IMAGE_URI} \
-            --target vllm-rayserve-ec2 \
-            -f docker/vllm/Dockerfile .
-
-      - name: Container push
-        run: |
-          docker push ${CI_IMAGE_URI}
-          docker rmi ${CI_IMAGE_URI}
+          tag: ${{ env.FRAMEWORK }}-${{ env.VLLM_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-rayserve-ec2-pr-${{ github.event.pull_request.number }}
 
   set-rayserve-test-environment:
     needs: [check-changes, build-vllm-rayserve-image]
@@ -461,7 +349,7 @@ jobs:
           echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}
 
   vllm-rayserve-regression-test:
-    needs: [build-vllm-rayserve-image, set-rayserve-test-environment]
+    needs: [set-rayserve-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -500,31 +388,14 @@ jobs:
 
       - name: Setup for vLLM tests
        run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
-            uv pip install --system pytest pytest-asyncio
-            uv pip install --system -e vllm_source/tests/vllm_test_utils
-            uv pip install --system hf_transfer
-            cd vllm_source
-            mkdir src
-            mv vllm src/vllm
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_0_10_2_test_setup.sh
 
       - name: Run vLLM tests
         run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            nvidia-smi
-
-            # Regression Test # 7min
-            cd vllm_source/tests
-            uv pip install --system modelscope
-            pytest -v -s test_regression.py
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_regression_test.sh
 
   vllm-rayserve-cuda-test:
-    needs: [build-vllm-rayserve-image, set-rayserve-test-environment]
+    needs: [set-rayserve-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -563,30 +434,14 @@ jobs:
 
       - name: Setup for vLLM tests
        run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
-            uv pip install --system pytest pytest-asyncio
-            uv pip install --system -e vllm_source/tests/vllm_test_utils
-            uv pip install --system hf_transfer
-            cd vllm_source
-            mkdir src
-            mv vllm src/vllm
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_0_10_2_test_setup.sh
 
       - name: Run vLLM tests
         run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            nvidia-smi
-
-            # Platform Tests (CUDA) # 4min
-            cd vllm_source/tests
-            pytest -v -s cuda/test_cuda_context.py
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_cuda_test.sh
 
   vllm-rayserve-example-test:
-    needs: [build-vllm-rayserve-image, set-rayserve-test-environment]
+    needs: [set-rayserve-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -625,47 +480,11 @@ jobs:
 
      - name: Setup for vLLM tests
        run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
-            uv pip install --system pytest pytest-asyncio
-            uv pip install --system -e vllm_source/tests/vllm_test_utils
-            uv pip install --system hf_transfer
-            cd vllm_source
-            mkdir src
-            mv vllm src/vllm
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_0_10_2_test_setup.sh
 
       - name: Run vLLM tests
         run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            nvidia-smi
-
-            # Examples Test # 30min
-            cd vllm_source/examples
-            pip install tensorizer # for tensorizer test
-            python3 offline_inference/basic/generate.py --model facebook/opt-125m
-            # python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-            python3 offline_inference/basic/chat.py
-            python3 offline_inference/prefix_caching.py
-            python3 offline_inference/llm_engine_example.py
-
-            # NOTE: Change in Ultravox model changed the class of a audio_processor https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/commit/9a3c571b8fdaf1e66dd3ea61bbcb6db5c70a438e
-            # vLLM created a fix here https://github.com/vllm-project/vllm/pull/29588 but it is not consumed in vLLM<=0.11
-            # python3 offline_inference/audio_language.py --seed 0
-
-            python3 offline_inference/vision_language.py --seed 0
-            # broken before v0.12.0: https://github.com/vllm-project/vllm/commit/c64c0b78de4716ef019666663c56b6ceaa019463
-            # python3 offline_inference/vision_language_pooling.py --seed
-            # python3 offline_inference/vision_language_multi_image.py --seed 0
-            python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-            python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-            python3 offline_inference/basic/classify.py
-            python3 offline_inference/basic/embed.py
-            python3 offline_inference/basic/score.py
-            VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_rayserve_examples_test.sh
 
   # ====================================================
   # =============== vLLM SageMaker jobs ================
   # ====================================================
@@ -681,44 +500,22 @@ jobs:
       group: ${{ github.workflow }}-build-vllm-sagemaker-image-${{ github.event.pull_request.number }}
       cancel-in-progress: true
     outputs:
-      ci-image: ${{ steps.image-uri-build.outputs.CI_IMAGE_URI }}
+      ci-image: ${{ steps.build.outputs.image-uri }}
     steps:
       - uses: actions/checkout@v5
-      - run: .github/scripts/buildkitd.sh
-      - name: ECR login
-        uses: ./.github/actions/ecr-authenticate
+      - name: Build image
+        id: build
+        uses: ./.github/actions/build-image
         with:
+          framework: ${{ env.FRAMEWORK }}
+          target: vllm-sagemaker
+          base-image: vllm/vllm-openai:v${{ env.VLLM_VERSION }}
+          framework-version: ${{ env.VLLM_VERSION }}
+          container-type: ${{ env.CONTAINER_TYPE }}
           aws-account-id: ${{ vars.CI_AWS_ACCOUNT_ID }}
           aws-region: ${{ vars.AWS_REGION }}
-
-      - name: Resolve image URI for build
-        id: image-uri-build
-        run: |
-          CI_IMAGE_URI=${{ vars.CI_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:vllm-${{ env.VLLM_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-sagemaker-pr-${{ github.event.pull_request.number }}
-          echo "Image URI to build: ${CI_IMAGE_URI}"
-          echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_ENV}
-          echo "CI_IMAGE_URI=${CI_IMAGE_URI}" >> ${GITHUB_OUTPUT}
-
-      - name: Build image
-        run: |
-          # base image: https://hub.docker.com/r/vllm/vllm-openai/tags
-          docker buildx build --progress plain \
-            --build-arg CACHE_REFRESH="$(date +"%Y-%m-%d")" \
-            --build-arg BASE_IMAGE="vllm/vllm-openai:v${{ env.VLLM_VERSION }}" \
-            --build-arg CONTAINER_TYPE="${{ env.CONTAINER_TYPE }}" \
-            --build-arg FRAMEWORK="${{ env.FRAMEWORK }}" \
-            --build-arg FRAMEWORK_VERSION="${{ env.VLLM_VERSION }}" \
-            --cache-to=type=inline \
-            --cache-from=type=registry,ref=${CI_IMAGE_URI} \
-            --tag ${CI_IMAGE_URI} \
-            --target vllm-sagemaker \
-            -f docker/vllm/Dockerfile .
-
-      - name: Container push
-        run: |
-          docker push ${CI_IMAGE_URI}
-          docker rmi ${CI_IMAGE_URI}
+          tag: ${{ env.FRAMEWORK }}-${{ env.VLLM_VERSION }}-gpu-${{ env.PYTHON_VERSION }}-${{ env.CUDA_VERSION }}-${{ env.OS_VERSION }}-sagemaker-pr-${{ github.event.pull_request.number }}
 
   set-sagemaker-test-environment:
     needs: [check-changes, build-vllm-sagemaker-image]
@@ -752,7 +549,7 @@ jobs:
           echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}
 
   vllm-sagemaker-regression-test:
-    needs: [build-vllm-sagemaker-image, set-sagemaker-test-environment]
+    needs: [set-sagemaker-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -791,34 +588,14 @@ jobs:
 
      - name: Setup for vLLM tests
        run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            # delete old test dependencies file and regen
-            rm vllm_source/requirements/test.txt
-            uv pip compile vllm_source/requirements/test.in -o vllm_source/requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
-            uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
-            uv pip install --system pytest pytest-asyncio
-            uv pip install --system -e vllm_source/tests/vllm_test_utils
-            uv pip install --system hf_transfer
-            cd vllm_source
-            mkdir src
-            mv vllm src/vllm
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_0_12_0_test_setup.sh
 
       - name: Run vLLM tests
         run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            nvidia-smi
-
-            # Regression Test # 7min
-            cd vllm_source/tests
-            uv pip install --system modelscope
-            pytest -v -s test_regression.py
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_regression_test.sh
 
   vllm-sagemaker-cuda-test:
-    needs: [build-vllm-sagemaker-image, set-sagemaker-test-environment]
+    needs: [set-sagemaker-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -857,33 +634,14 @@ jobs:
 
      - name: Setup for vLLM tests
        run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            # delete old test dependencies file and regen
-            rm vllm_source/requirements/test.txt
-            uv pip compile vllm_source/requirements/test.in -o vllm_source/requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
-            uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
-            uv pip install --system pytest pytest-asyncio
-            uv pip install --system -e vllm_source/tests/vllm_test_utils
-            uv pip install --system hf_transfer
-            cd vllm_source
-            mkdir src
-            mv vllm src/vllm
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_0_12_0_test_setup.sh
 
       - name: Run vLLM tests
         run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            nvidia-smi
-
-            # Platform Tests (CUDA) # 4min
-            cd vllm_source/tests
-            pytest -v -s cuda/test_cuda_context.py
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_cuda_test.sh
 
   vllm-sagemaker-example-test:
-    needs: [build-vllm-sagemaker-image, set-sagemaker-test-environment]
+    needs: [set-sagemaker-test-environment]
     if: success()
     runs-on:
       - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
@@ -922,63 +680,11 @@ jobs:
 
      - name: Setup for vLLM tests
        run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            # delete old test dependencies file and regen
-            rm vllm_source/requirements/test.txt
-            uv pip compile vllm_source/requirements/test.in -o vllm_source/requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
-            uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
-            uv pip install --system pytest pytest-asyncio
-            uv pip install --system -e vllm_source/tests/vllm_test_utils
-            uv pip install --system hf_transfer
-            cd vllm_source
-            mkdir src
-            mv vllm src/vllm
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_0_12_0_test_setup.sh
 
       - name: Run vLLM tests
         run: |
-          docker exec ${CONTAINER_ID} sh -c '
-            set -eux
-            nvidia-smi
-
-            cd vllm_source
-
-            # Test LoRA adapter loading/unloading via SageMaker endpoints
-            pytest tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py -v
-
-            # Test stateful session management
-            pytest tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py -v
-
-            # Test sagemaker custom middleware
-            pytest tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py -v
-
-            # Test sagemaker endpoint overrides
-            pytest tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py -v
-
-            # Test LoRA adapter loading/unloading via original OpenAI API server endpoints
-            pytest tests/entrypoints/openai/test_lora_adapters.py -v
-
-            cd examples
-            pip install tensorizer # for tensorizer test
-            python3 offline_inference/basic/generate.py --model facebook/opt-125m
-            # python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-            python3 offline_inference/basic/chat.py
-            python3 offline_inference/prefix_caching.py
-            python3 offline_inference/llm_engine_example.py
-            python3 offline_inference/audio_language.py --seed 0
-            python3 offline_inference/vision_language.py --seed 0
-            python3 offline_inference/vision_language_pooling.py --seed 0
-            python3 offline_inference/vision_language_multi_image.py --seed 0
-            python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-            python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
-            python3 offline_inference/basic/classify.py
-            python3 offline_inference/basic/embed.py
-            python3 offline_inference/basic/score.py
-            python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-            # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
-            python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
-          '
+          docker exec ${CONTAINER_ID} scripts/vllm/vllm_sagemaker_examples_test.sh
 
   vllm-sagemaker-endpoint-test:
     needs: [set-sagemaker-test-environment]
diff --git a/scripts/vllm/vllm_0_10_2_test_setup.sh b/scripts/vllm/vllm_0_10_2_test_setup.sh
new file mode 100755
index 000000000000..439f8033ddf9
--- /dev/null
+++ b/scripts/vllm/vllm_0_10_2_test_setup.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+set -eux
+
+uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
+uv pip install --system pytest pytest-asyncio
+uv pip install --system -e vllm_source/tests/vllm_test_utils
+uv pip install --system hf_transfer
+cd vllm_source
+mkdir src
+mv vllm src/vllm
\ No newline at end of file
diff --git a/scripts/vllm/vllm_0_12_0_test_setup.sh b/scripts/vllm/vllm_0_12_0_test_setup.sh
new file mode 100755
index 000000000000..9a4a09b3fae4
--- /dev/null
+++ b/scripts/vllm/vllm_0_12_0_test_setup.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+set -eux
+
+# delete old test dependencies file and regen
+rm vllm_source/requirements/test.txt
+uv pip compile vllm_source/requirements/test.in -o vllm_source/requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
+uv pip install --system -r vllm_source/requirements/common.txt -r vllm_source/requirements/dev.txt --torch-backend=auto
+uv pip install --system pytest pytest-asyncio
+uv pip install --system -e vllm_source/tests/vllm_test_utils
+uv pip install --system hf_transfer
+cd vllm_source
+mkdir src
+mv vllm src/vllm
\ No newline at end of file
diff --git a/scripts/vllm/vllm_cuda_test.sh b/scripts/vllm/vllm_cuda_test.sh
new file mode 100755
index 000000000000..ed0b2c0082df
--- /dev/null
+++ b/scripts/vllm/vllm_cuda_test.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+set -eux
+
+nvidia-smi
+# Platform Tests (CUDA) # 4min
+cd vllm_source/tests
+pytest -v -s cuda/test_cuda_context.py
\ No newline at end of file
diff --git a/scripts/vllm/vllm_ec2_examples_test.sh b/scripts/vllm/vllm_ec2_examples_test.sh
new file mode 100755
index 000000000000..ef7f3f0a8c8f
--- /dev/null
+++ b/scripts/vllm/vllm_ec2_examples_test.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+set -eux
+
+nvidia-smi
+cd vllm_source/examples
+pip install tensorizer # for tensorizer test
+python3 offline_inference/basic/generate.py --model facebook/opt-125m
+# python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+python3 offline_inference/basic/chat.py
+python3 offline_inference/prefix_caching.py
+python3 offline_inference/llm_engine_example.py
+python3 offline_inference/audio_language.py --seed 0
+python3 offline_inference/vision_language.py --seed 0
+python3 offline_inference/vision_language_pooling.py --seed 0
+python3 offline_inference/vision_language_multi_image.py --seed 0
+python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+python3 offline_inference/basic/classify.py
+python3 offline_inference/basic/embed.py
+python3 offline_inference/basic/score.py
+python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
\ No newline at end of file
diff --git a/scripts/vllm/vllm_rayserve_examples_test.sh b/scripts/vllm/vllm_rayserve_examples_test.sh
new file mode 100755
index 000000000000..39de96c4e420
--- /dev/null
+++ b/scripts/vllm/vllm_rayserve_examples_test.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+set -eux
+
+nvidia-smi
+
+# Examples Test # 30min
+cd vllm_source/examples
+pip install tensorizer # for tensorizer test
+python3 offline_inference/basic/generate.py --model facebook/opt-125m
+# python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+python3 offline_inference/basic/chat.py
+python3 offline_inference/prefix_caching.py
+python3 offline_inference/llm_engine_example.py
+
+# NOTE: Change in Ultravox model changed the class of a audio_processor https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/commit/9a3c571b8fdaf1e66dd3ea61bbcb6db5c70a438e
+# vLLM created a fix here https://github.com/vllm-project/vllm/pull/29588 but it is not consumed in vLLM<=0.11
+# python3 offline_inference/audio_language.py --seed 0
+
+python3 offline_inference/vision_language.py --seed 0
+# broken before v0.12.0: https://github.com/vllm-project/vllm/commit/c64c0b78de4716ef019666663c56b6ceaa019463
+# python3 offline_inference/vision_language_pooling.py --seed
+# python3 offline_inference/vision_language_multi_image.py --seed 0
+python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+python3 offline_inference/basic/classify.py
+python3 offline_inference/basic/embed.py
+python3 offline_inference/basic/score.py
+VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
\ No newline at end of file
diff --git a/scripts/vllm/vllm_regression_test.sh b/scripts/vllm/vllm_regression_test.sh
new file mode 100755
index 000000000000..8de728dbd9d0
--- /dev/null
+++ b/scripts/vllm/vllm_regression_test.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -eux
+nvidia-smi
+
+# Regression Test # 7min
+cd vllm_source/tests
+uv pip install --system modelscope
+pytest -v -s test_regression.py
\ No newline at end of file
diff --git a/scripts/vllm/vllm_sagemaker_examples_test.sh b/scripts/vllm/vllm_sagemaker_examples_test.sh
new file mode 100755
index 000000000000..bed45806d05a
--- /dev/null
+++ b/scripts/vllm/vllm_sagemaker_examples_test.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+set -eux
+nvidia-smi
+
+cd vllm_source
+
+# Test LoRA adapter loading/unloading via SageMaker endpoints
+pytest tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py -v
+
+# Test stateful session management
+pytest tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py -v
+
+# Test sagemaker custom middleware
+pytest tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py -v
+
+# Test sagemaker endpoint overrides
+pytest tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py -v
+
+# Test LoRA adapter loading/unloading via original OpenAI API server endpoints
+pytest tests/entrypoints/openai/test_lora_adapters.py -v
+
+cd examples
+pip install tensorizer # for tensorizer test
+python3 offline_inference/basic/generate.py --model facebook/opt-125m
+# python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+python3 offline_inference/basic/chat.py
+python3 offline_inference/prefix_caching.py
+python3 offline_inference/llm_engine_example.py
+python3 offline_inference/audio_language.py --seed 0
+python3 offline_inference/vision_language.py --seed 0
+python3 offline_inference/vision_language_pooling.py --seed 0
+python3 offline_inference/vision_language_multi_image.py --seed 0
+python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
+python3 offline_inference/basic/classify.py
+python3 offline_inference/basic/embed.py
+python3 offline_inference/basic/score.py
+python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
\ No newline at end of file
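For reference, the `ci-image` output wired up in the build jobs above is how the test-environment jobs pick up the freshly pushed CI image. A minimal sketch of that consumption, assuming the existing shape of the set-*-test-environment steps (the concrete steps are outside this diff, and the step name here is illustrative):

  set-ec2-test-environment:
    needs: [check-changes, build-vllm-ec2-image]
    steps:
      - name: Resolve image URI for test
        id: image-uri-test
        run: |
          # ci-image is populated from steps.build.outputs.image-uri in build-vllm-ec2-image
          IMAGE_URI=${{ needs.build-vllm-ec2-image.outputs.ci-image }}
          echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}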