From 7dc430dc82a219a8a2ecb7f38df94917d1237718 Mon Sep 17 00:00:00 2001 From: Eugene Clark Date: Thu, 5 Jun 2025 16:16:22 -0400 Subject: [PATCH 1/2] Integrate Hackathon vrs-in-a-box into vrs-python --- vrs-in-a-box/.gitignore | 2 + vrs-in-a-box/Dockerfile | 57 ++++++++++++++++++++++++++ vrs-in-a-box/README.md | 62 ++++++++++++++++++++++++++++ vrs-in-a-box/VrsVcfAnnotator.wdl | 43 ++++++++++++++++++++ vrs-in-a-box/run.sh | 2 + vrs-in-a-box/seqrepo.sh | 69 ++++++++++++++++++++++++++++++++ 6 files changed, 235 insertions(+) create mode 100644 vrs-in-a-box/.gitignore create mode 100644 vrs-in-a-box/Dockerfile create mode 100644 vrs-in-a-box/README.md create mode 100644 vrs-in-a-box/VrsVcfAnnotator.wdl create mode 100755 vrs-in-a-box/run.sh create mode 100755 vrs-in-a-box/seqrepo.sh diff --git a/vrs-in-a-box/.gitignore b/vrs-in-a-box/.gitignore new file mode 100644 index 00000000..6aeca283 --- /dev/null +++ b/vrs-in-a-box/.gitignore @@ -0,0 +1,2 @@ +seqrepo-* +GCF_* diff --git a/vrs-in-a-box/Dockerfile b/vrs-in-a-box/Dockerfile new file mode 100644 index 00000000..6d2a4e87 --- /dev/null +++ b/vrs-in-a-box/Dockerfile @@ -0,0 +1,57 @@ +# Builder image +FROM python:3.12-slim AS build + +# Either 'GRCh38' or 'GRCh37' +ARG ASSEMBLY="GRCh38" +ARG VRS_PYTHON_VERSION="2.1.2" + +# Install packages needed for the build +RUN apt-get update && apt-get upgrade -y && apt-get install -y \ + curl \ + git \ + libpq-dev \ + python3-pip \ + python3-venv \ + rsync \ + zlib1g-dev \ + postgresql \ + unzip \ + ; + +WORKDIR /vrs-python + +# Setup the virtual env for vrs-python +RUN python3 -m venv /vrs-python/venv +ENV PATH=/vrs-python/venv/bin:$PATH + +# Install vrs-python +RUN /vrs-python/venv/bin/python3 -m pip install -U setuptools +RUN /vrs-python/venv/bin/python3 -m pip install ga4gh.vrs[extras]==${VRS_PYTHON_VERSION} + +# Download and unpack seqrepo files +COPY /seqrepo-${ASSEMBLY}.zip / +RUN unzip /seqrepo-${ASSEMBLY}.zip -d / + +# Final image +FROM python:3.12-slim AS vrs-python +ARG 
ASSEMBLY +ENV ASSEMBLY=${ASSEMBLY} + +# Install runtime required packages +RUN apt-get update && apt-get install -y libpq-dev + +# Copy over artifacts from the builder +COPY --from=build /vrs-python /vrs-python +COPY --from=build /seqrepo-${ASSEMBLY} /seqrepo-${ASSEMBLY} + +# Copy over run script +COPY ./run.sh /run.sh + +# Set environment variables +ENV GA4GH_VRS_DATAPROXY_URI="seqrepo+file:///seqrepo-${ASSEMBLY}/master" +ENV VIRTUAL_ENV=/vrs-python/venv +ENV PATH=/vrs-python/venv/bin:$PATH + +WORKDIR / + +ENTRYPOINT [ "/run.sh" ] diff --git a/vrs-in-a-box/README.md b/vrs-in-a-box/README.md new file mode 100644 index 00000000..87a337bb --- /dev/null +++ b/vrs-in-a-box/README.md @@ -0,0 +1,62 @@ +# VRS-in-a-Box VCF Annotator for Assembly GRCh37 or GRCh38 +VRS-in-a-Box is a single Docker image that is able to annotate a VCF with VRS IDs +using the `vrs-annotate` tool that is part of `vrs-python`. The Docker image includes +all the dependencies required for VRS ID computation of genomic variants on one of the +assembled chromosomes for a specific reference assembly. + +The Docker image is kept to a minimum by creating an instance of `seqrepo` that +contains only the assembled chromosomes for a single reference assembly. + +Prebuilt images are available in Docker Hub: https://hub.docker.com/u/ga4gh + +## Using VRS-in-a-Box in Terra +VRS-in-a-Box can be easily added to a workflow in Terra to annotate a VCF file with +VRS IDs. The `VrsVcfAnnotator.wdl` file contains a simple workflow with one task +that will annotate a VCF file using the pre-built images in Docker Hub. + +## Building VRS-in-a-Box +The following instructions describe how to build a VRS-in-a-Box image from scratch. + + +#### Create the Build Environment +Install any [prerequisites](https://github.com/biocommons/biocommons.seqrepo#requirements) +needed for `seqrepo` and create a Python virtual environment. 
+```bash +python -m venv venv +source venv/bin/activate +pip install biocommons.seqrepo +``` + +#### Download the Reference Assembly Sequences +```bash +# GRCh38 +curl -O https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.26_GRCh38/GCF_000001405.26_GRCh38_genomic.fna.gz +# GRCh37 +curl -O https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.13_GRCh37/GCF_000001405.13_GRCh37_genomic.fna.gz +``` + +#### Build Seqrepo +```bash +# GRCh38 +bash seqrepo.sh GCF_000001405.26_GRCh38_genomic.fna.gz GRCh38 +# GRCh37 +bash seqrepo.sh GCF_000001405.13_GRCh37_genomic.fna.gz GRCh37 +``` + +## Build Images for Each Assembly +```shell +# GRCh38 +docker build --build-arg ASSEMBLY=GRCh38 --build-arg VRS_PYTHON_VERSION=2.1.2 -t ga4gh/vrs-vcf-annotator-grch38:2.1.2 . +# GRCh37 +docker build --build-arg ASSEMBLY=GRCh37 --build-arg VRS_PYTHON_VERSION=2.1.2 -t ga4gh/vrs-vcf-annotator-grch37:2.1.2 . 
+``` + +## Running the Image in Docker +Run the image to annotate the VCF file `NA12878.vcf` in the current directory: +```shell +docker run -it --rm -v $(pwd):/input ga4gh/vrs-vcf-annotator-grch38:2.1.2 /input/NA12878.vcf --vcf-out /input/NA12878_with_vrs.vcf +``` +Run the image to annotate the VCF file `NA12878.vcf` in the current directory and capture the VRS objects in a separate file: +```shell +docker run -it --rm -v $(pwd):/input ga4gh/vrs-vcf-annotator-grch38:2.1.2 /input/NA12878.vcf --vcf-out /input/NA12878_with_vrs.vcf --ndjson-out /input/vrs-objects.json +``` diff --git a/vrs-in-a-box/VrsVcfAnnotator.wdl b/vrs-in-a-box/VrsVcfAnnotator.wdl new file mode 100644 index 00000000..3ef4d034 --- /dev/null +++ b/vrs-in-a-box/VrsVcfAnnotator.wdl @@ -0,0 +1,43 @@ +version 1.0 + +workflow VrsVcfAnnotator { + input { + File vcf_file + String reference_assembly = "GRCh38" + } + + call VrsVcfAnnotatorTask { + input: + vcf_file = vcf_file, + reference_assembly = reference_assembly + } + + output { + File output_vcf_file = VrsVcfAnnotatorTask.output_vcf_file + File output_vrs_objects = VrsVcfAnnotatorTask.output_vrs_objects + } + +} + +task VrsVcfAnnotatorTask { + input { + File vcf_file + String reference_assembly = "GRCh38" + } + + String ref_asm_lc = reference_assembly.toLower() + + command <<< + vrs-annotate vcf --assembly ~{reference_assembly} "~{vcf_file}" --vcf-out "with_vrs_ids.vcf" --ndjson-out "vrs_objects.json" + >>> + + runtime { + docker: "ga4gh/vrs-vcf-annotator-~{ref_asm_lc}:2.1.2" + memory: "4GB" + } + + output { + File output_vcf_file = "with_vrs_ids.vcf" + File output_vrs_objects = "vrs_objects.json" + } +} diff --git a/vrs-in-a-box/run.sh b/vrs-in-a-box/run.sh new file mode 100755 index 00000000..1a745408 --- /dev/null +++ b/vrs-in-a-box/run.sh @@ -0,0 +1,2 @@ +#!/bin/sh +/vrs-python/venv/bin/vrs-annotate vcf --assembly ${ASSEMBLY} $@ diff --git a/vrs-in-a-box/seqrepo.sh b/vrs-in-a-box/seqrepo.sh new file mode 100755 index 00000000..7c87da64 --- 
/dev/null +++ b/vrs-in-a-box/seqrepo.sh @@ -0,0 +1,69 @@ +#!/bin/bash +set -e # Exit on any error + + +# Check if an argument is provided +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + exit 1 +fi + +FASTA_FILE=$1 +ASSEMBLY_NAME=$2 + +export SEQREPO_ROOT=seqrepo-$ASSEMBLY_NAME + +# Check if the file exists +if [ ! -f "$FASTA_FILE" ]; then + echo "ERROR: File '$FASTA_FILE' not found!" 1>&2 + exit 1 +fi + +# Check if openrsync is in the output of /usr/bin/rsync --version +RSYNC_EXE=${RSYNC:-rsync} +if $RSYNC_EXE --version | grep -q "openrsync"; then + if [ -x "/opt/homebrew/bin/rsync" ]; then + RSYNC_OPTION="--rsync-exe /opt/homebrew/bin/rsync" + else + echo "ERROR: seqrepo requires 'rsync' (not 'openrsync') to run." 1>&2 + echo " On MacOS rsync can be installed with Homebrew." 1>&2 + echo " Set the RSYNC environment variable to specify the rsync executable to use." 1>&2 + exit 1 + fi +else + RSYNC_OPTION="--rsync-exe $RSYNC_EXE" +fi + +# Initialize seqrepo +echo "Initializing seqrepo in $SEQREPO_ROOT..." +seqrepo -r seqrepo --root-directory $SEQREPO_ROOT $RSYNC_OPTION init + +# Load the provided FASTA file +echo "Loading FASTA file: $FASTA_FILE" +seqrepo -r seqrepo --root-directory $SEQREPO_ROOT $RSYNC_OPTION load "$FASTA_FILE" -n NCBI + +# Add assembly names +echo "Adding assembly names..." 
+seqrepo -r seqrepo --root-directory $SEQREPO_ROOT $RSYNC_OPTION add-assembly-names + +echo "Initial seqrepo build completed" + +# vrs-annotate checks that GRCh38:1 is in the database as a sanity check +# so for non-GRCh38, we have to add a dummy record so it will run +if [ "$ASSEMBLY_NAME" != "GRCh38" ]; then +sqlite3 $SEQREPO_ROOT/master/aliases.sqlite3 < Date: Thu, 24 Jul 2025 15:17:06 -0400 Subject: [PATCH 2/2] Remove WDL example --- vrs-in-a-box/README.md | 5 ---- vrs-in-a-box/VrsVcfAnnotator.wdl | 43 -------------------------------- 2 files changed, 48 deletions(-) delete mode 100644 vrs-in-a-box/VrsVcfAnnotator.wdl diff --git a/vrs-in-a-box/README.md b/vrs-in-a-box/README.md index 87a337bb..9185246e 100644 --- a/vrs-in-a-box/README.md +++ b/vrs-in-a-box/README.md @@ -9,11 +9,6 @@ contains only the assembled chromosomes for a single reference assembly. Prebuilt images are available in Docker Hub: https://hub.docker.com/u/ga4gh -## Using VRS-in-a-Box in Terra -VRS-in-a-Box can be easily added to a workflow in Terra to annotate a VCF file with -VRS IDs. The `VrsVcfAnnotator.wdl` file contains a simple workflow with one task -that will annotate a VCF file using the pre-built images in Docker Hub. - ## Building VRS-in-a-Box The following instructions describe how to build a VRS-in-a-Box image from scratch. 
diff --git a/vrs-in-a-box/VrsVcfAnnotator.wdl b/vrs-in-a-box/VrsVcfAnnotator.wdl deleted file mode 100644 index 3ef4d034..00000000 --- a/vrs-in-a-box/VrsVcfAnnotator.wdl +++ /dev/null @@ -1,43 +0,0 @@ -version 1.0 - -workflow VrsVcfAnnotator { - input { - File vcf_file - String reference_assembly = "GRCh38" - } - - call VrsVcfAnnotatorTask { - input: - vcf_file = vcf_file, - reference_assembly = reference_assembly - } - - output { - File output_vcf_file = VrsVcfAnnotatorTask.output_vcf_file - File output_vrs_objects = VrsVcfAnnotatorTask.output_vrs_objects - } - -} - -task VrsVcfAnnotatorTask { - input { - File vcf_file - String reference_assembly = "GRCh38" - } - - String ref_asm_lc = reference_assembly.toLower() - - command <<< - vrs-annotate vcf --assembly ~{reference_assembly} "~{vcf_file}" --vcf-out "with_vrs_ids.vcf" --ndjson-out "vrs_objects.json" - >>> - - runtime { - docker: "ga4gh/vrs-vcf-annotator-~{ref_asm_lc}:2.1.2" - memory: "4GB" - } - - output { - File output_vcf_file = "with_vrs_ids.vcf" - File output_vrs_objects = "vrs_objects.json" - } -}