diff --git a/.github/workflows/PublishReport.yml b/.github/workflows/PublishReport.yml
new file mode 100644
index 00000000..31df06fd
--- /dev/null
+++ b/.github/workflows/PublishReport.yml
@@ -0,0 +1,110 @@
+name: Publish report
+on:
+  workflow_dispatch:
+    inputs:
+      instance_id:
+        type: string
+      solutions:
+        type: string
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/master' || github.sha }}
+  cancel-in-progress: true
+
+env:
+  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  gh_issue_repo: duckdblabs/db-benchmark
+  instance_id: ${{ inputs.instance_id }}
+  solutions: ${{ inputs.solutions }}
+
+
+jobs:
+  start-aws-machine:
+    name: Start aws-small-machine
+    runs-on: ubuntu-latest
+    environment: aws-secrets
+    steps:
+      - name: Start EC2 runner
+        shell: bash
+        env:
+          AWS_ACCESS_KEY_ID: ${{secrets.AWS_ACCESS_KEY_ID}}
+          AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_SECRET_ACCESS_KEY}}
+          AWS_DEFAULT_REGION: us-east-1
+        run: aws ec2 start-instances --instance-id ${{ env.instance_id }}
+
+      - name: Create issue if failure
+        shell: bash
+        if: ${{ failure() && contains(github.ref_name, 'main') }}
+        run: |
+          gh issue create --repo ${{ env.gh_issue_repo }} --title "Could not start DB-benchmark machine" --body "AWS box with instance-id ${{ env.instance_id }} could not be started"
+
+  run-benchmark:
+    name: Generate Assets
+    env:
+      CC: gcc-10
+      CXX: g++-10
+      GEN: ninja
+    runs-on: report-generator
+    environment: aws-secrets
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: run mount
+        shell: bash
+        run: |
+          ./_setup_utils/mount.sh
+
+      - name: Setup git commit
+        shell: bash
+        working-directory: /var/lib/mount/db-benchmark-metal
+        run: |
+          git config --global user.email "Tmonster "
+          git config --global user.name "Publish report action"
+
+      - name: Download the data
+        shell: bash
+        working-directory: /var/lib/mount/db-benchmark-metal
+        env:
+          DO_REPORT: true
+          DO_PUBLISH: true
+        run: |
+          ./_run/download_small_medium.sh
+          ./_run/download_large_data.sh
+          ./_run/generate_report.sh
+          ./report/publish.sh
+
+      # if something doesn't work, upload the assets
+      - name: Create Archive
+        if: always()
+        shell: bash
+        working-directory: /var/lib/mount/db-benchmark-metal
+        run: |
+          mkdir -p out
+          echo "guarantee not empty dir" > out/guarantee.txt
+          zip -r out-dir.zip out/ public/
+
+      - uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: out-dir.zip
+          path: /var/lib/mount/db-benchmark-metal/out-dir.zip
+          if-no-files-found: error
+
+  shutdown:
+    name: shut down
+    environment: aws-secrets
+    if: always()
+    runs-on: ubuntu-latest
+    needs:
+      - start-aws-machine
+      - run-benchmark
+
+    steps:
+      - name: shutdown
+        shell: bash
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_DEFAULT_REGION: us-east-1
+        run: aws ec2 stop-instances --instance-id ${{ env.instance_id }}
+
diff --git a/_run/download_large_data.sh b/_run/download_large_data.sh
new file mode 100755
index 00000000..a3d46c90
--- /dev/null
+++ b/_run/download_large_data.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# download and expand large data
+
+# get groupby large (50GB datasets)
+if [ ! -f data/groupby_large.duckdb ]; then
+  aws s3 cp s3://duckdb-data-for-ec2-regression-tests/db-benchmark-data/groupby_large.duckdb data/groupby_large.duckdb --quiet
+fi
+
+# get join large (50GB datasets)
+if [ ! -f data/join_large.duckdb ]; then
+  aws s3 cp s3://duckdb-data-for-ec2-regression-tests/db-benchmark-data/join_large.duckdb data/join_large.duckdb --quiet
+fi
+
+
+# expand groupby-large datasets to csv
+duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_0 to 'data/G1_1e9_1e2_0_0.csv' (FORMAT CSV)"
+duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e1_0_0 to 'data/G1_1e9_1e1_0_0.csv' (FORMAT CSV)"
+duckdb data/groupby_large.duckdb -c "copy G1_1e9_2e0_0_0 to 'data/G1_1e9_2e0_0_0.csv' (FORMAT CSV)"
+duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_1 to 'data/G1_1e9_1e2_0_1.csv' (FORMAT CSV)"
+duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_5_0 to 'data/G1_1e9_1e2_5_0.csv' (FORMAT CSV)"
+
+
+# expand join-large datasets to csv
+duckdb data/join_large.duckdb -c "copy J1_1e9_NA_0_0 to 'data/J1_1e9_NA_0_0.csv' (FORMAT CSV)"
+duckdb data/join_large.duckdb -c "copy J1_1e9_1e9_0_0 to 'data/J1_1e9_1e9_0_0.csv' (FORMAT CSV)"
+duckdb data/join_large.duckdb -c "copy J1_1e9_1e6_0_0 to 'data/J1_1e9_1e6_0_0.csv' (FORMAT CSV)"
+duckdb data/join_large.duckdb -c "copy J1_1e9_1e3_0_0 to 'data/J1_1e9_1e3_0_0.csv' (FORMAT CSV)"
+
diff --git a/_run/generate_report.sh b/_run/generate_report.sh
new file mode 100755
index 00000000..ddb2f6f7
--- /dev/null
+++ b/_run/generate_report.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+$DO_REPORT && echo "# Rendering report"
+$DO_REPORT && Rscript -e 'rmarkdown::render("./_report/index.Rmd", output_dir="public")' > ./out/rmarkdown_index.out 2>&1 && echo "# Benchmark index report produced"
+$DO_REPORT && Rscript -e 'rmarkdown::render("./_report/history.Rmd", output_dir="public")' > ./out/rmarkdown_history.out 2>&1 && echo "# Benchmark history report produced"
+$DO_REPORT && Rscript -e 'rmarkdown::render("./_report/tech.Rmd", output_dir="public")' > ./out/rmarkdown_tech.out 2>&1 && echo "# Benchmark tech report produced"
\ No newline at end of file
diff --git a/_run/run_large.sh b/_run/run_large.sh
index eb88089c..c18230a3 100755
--- a/_run/run_large.sh
+++ b/_run/run_large.sh
@@ -3,26 +3,7 @@
 rm data/*.csv
 rm data/*.duckdb
 
-# get groupby large (50GB datasets)
-aws s3 cp s3://duckdb-data-for-ec2-regression-tests/db-benchmark-data/groupby_large.duckdb data/groupby_large.duckdb --quiet
-# get join small (50GB datasets)
-aws s3 cp s3://duckdb-data-for-ec2-regression-tests/db-benchmark-data/join_large.duckdb data/join_large.duckdb --quiet
-
-
-# expand groupby-large datasets to csv
-duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_0 to 'data/G1_1e9_1e2_0_0.csv' (FORMAT CSV)"
-duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e1_0_0 to 'data/G1_1e9_1e1_0_0.csv' (FORMAT CSV)"
-duckdb data/groupby_large.duckdb -c "copy G1_1e9_2e0_0_0 to 'data/G1_1e9_2e0_0_0.csv' (FORMAT CSV)"
-duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_1 to 'data/G1_1e9_1e2_0_1.csv' (FORMAT CSV)"
-duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_5_0 to 'data/G1_1e9_1e2_5_0.csv' (FORMAT CSV)"
-
-
-# expand join-large datasets to csv
-duckdb data/join_large.duckdb -c "copy J1_1e9_NA_0_0 to 'data/J1_1e9_NA_0_0.csv' (FORMAT CSV)"
-duckdb data/join_large.duckdb -c "copy J1_1e9_1e9_0_0 to 'data/J1_1e9_1e9_0_0.csv' (FORMAT CSV)"
-duckdb data/join_large.duckdb -c "copy J1_1e9_1e6_0_0 to 'data/J1_1e9_1e6_0_0.csv' (FORMAT CSV)"
-duckdb data/join_large.duckdb -c "copy J1_1e9_1e3_0_0 to 'data/J1_1e9_1e3_0_0.csv' (FORMAT CSV)"
-
+./_run/download_large_data.sh
 
 cp _control/data_large.csv _control/data.csv
 
diff --git a/run.sh b/run.sh
index f17a5644..743c55c2 100755
--- a/run.sh
+++ b/run.sh
@@ -102,10 +102,7 @@ if [[ -f ./stop ]]; then echo "# Benchmark run $BATCH has been interrupted after
 # publish report for all tasks
 rm -rf ./public
 rm -f ./report-done
-$DO_REPORT && echo "# Rendering report"
-$DO_REPORT && Rscript -e 'rmarkdown::render("./_report/index.Rmd", output_dir="public")' > ./out/rmarkdown_index.out 2>&1 && echo "# Benchmark index report produced"
-$DO_REPORT && Rscript -e 'rmarkdown::render("./_report/history.Rmd", output_dir="public")' > ./out/rmarkdown_history.out 2>&1 && echo "# Benchmark history report produced"
-$DO_REPORT && Rscript -e 'rmarkdown::render("./_report/tech.Rmd", output_dir="public")' > ./out/rmarkdown_tech.out 2>&1 && echo "# Benchmark tech report produced"
+./_run/generate_report.sh
 
 # publish benchmark, only if all reports successfully generated (logged in ./report-done file), and token file exists
 if [[ -f ./stop ]]; then echo "# Benchmark run $BATCH has been interrupted after $(($(date +%s)-$BATCH))s due to 'stop' file" && rm -f ./stop && rm -f ./run.lock && exit; fi;
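
Usage sketch (not part of the diff): the workflow above is trigger-only via workflow_dispatch, so once merged it could be started from the GitHub CLI roughly as shown below; the instance id is a placeholder, not a real value.

    # dispatch the "Publish report" workflow, passing the EC2 instance to start
    gh workflow run PublishReport.yml --repo duckdblabs/db-benchmark -f instance_id=i-0123456789abcdef0

    # follow the run, then fetch the out-dir.zip artifact uploaded by the Create Archive step
    gh run watch --repo duckdblabs/db-benchmark
    gh run download --repo duckdblabs/db-benchmark --name out-dir.zip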