108 changes: 108 additions & 0 deletions .github/workflows/PublishReport.yml
@@ -0,0 +1,108 @@
name: Publish report
on:
  workflow_dispatch:
    inputs:
      instance_id:
        type: string
      solutions:
        type: string

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/master' || github.sha }}
  cancel-in-progress: true

env:
  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  gh_issue_repo: duckdblabs/db-benchmark
  instance_id: ${{ inputs.instance_id }}
  solutions: ${{ inputs.solutions }}


jobs:
  start-aws-machine:
    name: Start aws-small-machine
    runs-on: ubuntu-latest
    environment: aws-secrets
    steps:
      - name: Start EC2 runner
        shell: bash
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: us-east-1
        run: aws ec2 start-instances --instance-ids ${{ env.instance_id }}

      - name: Create issue if failure
        shell: bash
        if: ${{ failure() && contains(github.ref_name, 'main') }}
        run: |
          gh issue create --repo ${{ env.gh_issue_repo }} --title "Could not start DB-benchmark machine" --body "AWS box with instance-id ${{ env.instance_id }} could not be started"

  run-benchmark:
    name: Generate Assets
    env:
      CC: gcc-10
      CXX: g++-10
      GEN: ninja
    runs-on: report-generator
    environment: aws-secrets
    steps:
      - uses: actions/checkout@v4

      - name: run mount
        shell: bash
        run: |
          ./_setup_utils/mount.sh

      - name: Setup git commit
        shell: bash
        working-directory: /var/lib/mount/db-benchmark-metal
        run: |
          git config --global user.email "[email protected]"
          git config --global user.name "Publish report action"

      - name: Download the data
        shell: bash
        working-directory: /var/lib/mount/db-benchmark-metal
        env:
          # generate_report.sh gates on "$DO_REPORT && ...", so these must
          # expand to the commands `true`/`false`, not to 1/0
          DO_REPORT: true
          DO_PUBLISH: true
        run: |
          ./_run/download_small_medium.sh
          ./_run/download_large_data.sh
          ./_run/generate_report.sh
          ./report/publish.sh

      # if something doesn't work, upload the assets
      - name: Create Archive
        if: always()
        shell: bash
        working-directory: /var/lib/mount/db-benchmark-metal
        run: |
          mkdir -p out
          echo "guarantee not empty dir" > out/guarantee.txt
          zip -r out-dir.zip out/ public/

      - uses: actions/upload-artifact@v4
        if: always()
        with:
          name: out-dir.zip
          path: /var/lib/mount/db-benchmark-metal/out-dir.zip
          if-no-files-found: error

  shutdown:
    name: shut down
    environment: aws-secrets
    if: always()
    runs-on: ubuntu-latest
    needs:
      - start-aws-machine
      - run-benchmark

    steps:
      - name: shutdown
        shell: bash
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: us-east-1
        run: aws ec2 stop-instances --instance-ids ${{ env.instance_id }}

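Since the workflow only fires on `workflow_dispatch`, it has to be kicked off by hand. A minimal sketch of a manual dispatch via the `gh` CLI — the instance ID and solution name are placeholders, and I'm assuming the workflow file lives in `duckdblabs/db-benchmark` (the repo already referenced by `gh_issue_repo`):

```bash
# Manually dispatch the report workflow; input values are placeholders.
gh workflow run PublishReport.yml \
  --repo duckdblabs/db-benchmark \
  -f instance_id=i-0123456789abcdef0 \
  -f solutions=duckdb

# Follow the run it kicked off (prompts for which run to watch).
gh run watch --repo duckdblabs/db-benchmark
```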
27 changes: 27 additions & 0 deletions _run/download_large_data.sh
@@ -0,0 +1,27 @@
# download and expand large data

# get groupby large (50GB datasets)
if [ ! -f data/groupby_large.duckdb ]; then
  aws s3 cp s3://duckdb-data-for-ec2-regression-tests/db-benchmark-data/groupby_large.duckdb data/groupby_large.duckdb --quiet
fi

# get join large (50GB datasets)
if [ ! -f data/join_large.duckdb ]; then
  aws s3 cp s3://duckdb-data-for-ec2-regression-tests/db-benchmark-data/join_large.duckdb data/join_large.duckdb --quiet
fi


# expand groupby-large datasets to csv
duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_0 to 'data/G1_1e9_1e2_0_0.csv' (FORMAT CSV)"
duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e1_0_0 to 'data/G1_1e9_1e1_0_0.csv' (FORMAT CSV)"
duckdb data/groupby_large.duckdb -c "copy G1_1e9_2e0_0_0 to 'data/G1_1e9_2e0_0_0.csv' (FORMAT CSV)"
duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_1 to 'data/G1_1e9_1e2_0_1.csv' (FORMAT CSV)"
duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_5_0 to 'data/G1_1e9_1e2_5_0.csv' (FORMAT CSV)"


# expand join-large datasets to csv
duckdb data/join_large.duckdb -c "copy J1_1e9_NA_0_0 to 'data/J1_1e9_NA_0_0.csv' (FORMAT CSV)"
duckdb data/join_large.duckdb -c "copy J1_1e9_1e9_0_0 to 'data/J1_1e9_1e9_0_0.csv' (FORMAT CSV)"
duckdb data/join_large.duckdb -c "copy J1_1e9_1e6_0_0 to 'data/J1_1e9_1e6_0_0.csv' (FORMAT CSV)"
duckdb data/join_large.duckdb -c "copy J1_1e9_1e3_0_0 to 'data/J1_1e9_1e3_0_0.csv' (FORMAT CSV)"

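The nine `COPY ... TO` invocations above all follow one pattern, so they could be collapsed into a loop. A sketch of a hypothetical `export_tables` helper (not in the PR) that also skips CSVs that already exist, mirroring the guard used for the `.duckdb` downloads:

```bash
#!/bin/bash
# Hypothetical refactor of the expansion step above: one loop per source DB,
# skipping any CSV that was already exported on a previous run.
export_tables() {
  local db=$1; shift
  for tbl in "$@"; do
    if [ ! -f "data/${tbl}.csv" ]; then
      duckdb "$db" -c "copy ${tbl} to 'data/${tbl}.csv' (FORMAT CSV)"
    fi
  done
}

export_tables data/groupby_large.duckdb \
  G1_1e9_1e2_0_0 G1_1e9_1e1_0_0 G1_1e9_2e0_0_0 G1_1e9_1e2_0_1 G1_1e9_1e2_5_0
export_tables data/join_large.duckdb \
  J1_1e9_NA_0_0 J1_1e9_1e9_0_0 J1_1e9_1e6_0_0 J1_1e9_1e3_0_0
```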
4 changes: 4 additions & 0 deletions _run/generate_report.sh
@@ -0,0 +1,4 @@
$DO_REPORT && echo "# Rendering report"
$DO_REPORT && Rscript -e 'rmarkdown::render("./_report/index.Rmd", output_dir="public")' > ./out/rmarkdown_index.out 2>&1 && echo "# Benchmark index report produced"
$DO_REPORT && Rscript -e 'rmarkdown::render("./_report/history.Rmd", output_dir="public")' > ./out/rmarkdown_history.out 2>&1 && echo "# Benchmark history report produced"
$DO_REPORT && Rscript -e 'rmarkdown::render("./_report/tech.Rmd", output_dir="public")' > ./out/rmarkdown_tech.out 2>&1 && echo "# Benchmark tech report produced"
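These lines gate on `$DO_REPORT && ...`, which only works if the variable expands to the shell commands `true` or `false` (exit code 0 or 1), not to `1`/`0` — hence the env values set in the workflow above. A two-line illustration of the idiom:

```bash
DO_REPORT=true    # `true` exits 0, so the command after && runs
$DO_REPORT && echo "report rendering enabled"

DO_REPORT=false   # `false` exits 1, so the command after && is skipped
$DO_REPORT && echo "never printed"
```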
21 changes: 1 addition & 20 deletions _run/run_large.sh
@@ -3,26 +3,7 @@
rm data/*.csv
rm data/*.duckdb

# get groupby large (50GB datasets)
aws s3 cp s3://duckdb-data-for-ec2-regression-tests/db-benchmark-data/groupby_large.duckdb data/groupby_large.duckdb --quiet
# get join small (50GB datasets)
aws s3 cp s3://duckdb-data-for-ec2-regression-tests/db-benchmark-data/join_large.duckdb data/join_large.duckdb --quiet


# expand groupby-large datasets to csv
duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_0 to 'data/G1_1e9_1e2_0_0.csv' (FORMAT CSV)"
duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e1_0_0 to 'data/G1_1e9_1e1_0_0.csv' (FORMAT CSV)"
duckdb data/groupby_large.duckdb -c "copy G1_1e9_2e0_0_0 to 'data/G1_1e9_2e0_0_0.csv' (FORMAT CSV)"
duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_1 to 'data/G1_1e9_1e2_0_1.csv' (FORMAT CSV)"
duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_5_0 to 'data/G1_1e9_1e2_5_0.csv' (FORMAT CSV)"


# expand join-large datasets to csv
duckdb data/join_large.duckdb -c "copy J1_1e9_NA_0_0 to 'data/J1_1e9_NA_0_0.csv' (FORMAT CSV)"
duckdb data/join_large.duckdb -c "copy J1_1e9_1e9_0_0 to 'data/J1_1e9_1e9_0_0.csv' (FORMAT CSV)"
duckdb data/join_large.duckdb -c "copy J1_1e9_1e6_0_0 to 'data/J1_1e9_1e6_0_0.csv' (FORMAT CSV)"
duckdb data/join_large.duckdb -c "copy J1_1e9_1e3_0_0 to 'data/J1_1e9_1e3_0_0.csv' (FORMAT CSV)"

./_run/download_large_data.sh

cp _control/data_large.csv _control/data.csv

5 changes: 1 addition & 4 deletions run.sh
@@ -102,10 +102,7 @@ if [[ -f ./stop ]]; then echo "# Benchmark run $BATCH has been interrupted after
# publish report for all tasks
rm -rf ./public
rm -f ./report-done
$DO_REPORT && echo "# Rendering report"
$DO_REPORT && Rscript -e 'rmarkdown::render("./_report/index.Rmd", output_dir="public")' > ./out/rmarkdown_index.out 2>&1 && echo "# Benchmark index report produced"
$DO_REPORT && Rscript -e 'rmarkdown::render("./_report/history.Rmd", output_dir="public")' > ./out/rmarkdown_history.out 2>&1 && echo "# Benchmark history report produced"
$DO_REPORT && Rscript -e 'rmarkdown::render("./_report/tech.Rmd", output_dir="public")' > ./out/rmarkdown_tech.out 2>&1 && echo "# Benchmark tech report produced"
./_run/generate_report.sh

# publish benchmark, only if all reports successfully generated (logged in ./report-done file), and token file exists
if [[ -f ./stop ]]; then echo "# Benchmark run $BATCH has been interrupted after $(($(date +%s)-$BATCH))s due to 'stop' file" && rm -f ./stop && rm -f ./run.lock && exit; fi;
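The `./stop` sentinel lets an operator interrupt the batch between phases without signalling the process. A sketch of the operator side, assuming the benchmark checkout as the working directory (`run.lock` is the lock file removed above):

```bash
# Ask the running benchmark to stop at its next checkpoint...
touch stop
# ...then wait for it to release its lock before reusing the machine.
while [ -f run.lock ]; do sleep 10; done
echo "benchmark batch stopped"
```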