Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions vrs-in-a-box/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# seqrepo build output created by seqrepo.sh (seqrepo-<assembly> directory and .zip archive)
seqrepo-*
# Downloaded NCBI reference assembly files (e.g. GCF_..._genomic.fna.gz)
GCF_*
57 changes: 57 additions & 0 deletions vrs-in-a-box/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# VRS-in-a-Box: two-stage build producing an image whose entrypoint runs
# `vrs-annotate vcf` against a bundled, single-assembly seqrepo instance.
#
# Build args:
#   ASSEMBLY            Either 'GRCh38' or 'GRCh37' (default: GRCh38). The build
#                       context must contain seqrepo-${ASSEMBLY}.zip.
#   VRS_PYTHON_VERSION  Version of ga4gh.vrs to install (default: 2.1.2).

# Declared before the first FROM so the default is shared by every stage.
# An ARG declared inside one stage is not visible in later stages, so each
# stage that needs the value re-declares `ARG ASSEMBLY` without a value.
ARG ASSEMBLY="GRCh38"

# Builder image
FROM python:3.12-slim AS build

ARG ASSEMBLY
ARG VRS_PYTHON_VERSION="2.1.2"

# Install packages needed for the build
RUN apt-get update && apt-get upgrade -y && apt-get install -y \
    curl \
    git \
    libpq-dev \
    python3-pip \
    python3-venv \
    rsync \
    zlib1g-dev \
    postgresql \
    unzip

WORKDIR /vrs-python

# Setup the virtual env for vrs-python
RUN python3 -m venv /vrs-python/venv
ENV PATH=/vrs-python/venv/bin:$PATH

# Install vrs-python.
# The requirement is quoted so the shell cannot treat "[extras]" as a glob;
# --no-cache-dir keeps pip's download cache out of /vrs-python's sibling layers.
RUN /vrs-python/venv/bin/python3 -m pip install --no-cache-dir -U setuptools
RUN /vrs-python/venv/bin/python3 -m pip install --no-cache-dir \
    "ga4gh.vrs[extras]==${VRS_PYTHON_VERSION}"

# Copy in and unpack the prebuilt seqrepo archive from the build context
COPY /seqrepo-${ASSEMBLY}.zip /
RUN unzip /seqrepo-${ASSEMBLY}.zip -d /

# Final image
FROM python:3.12-slim AS vrs-python

# Picks up the global default (or the --build-arg value) for this stage
ARG ASSEMBLY
# Exposed at runtime because run.sh passes it to `vrs-annotate --assembly`
ENV ASSEMBLY=${ASSEMBLY}

# Install runtime required packages; drop the apt lists to keep the layer small
RUN apt-get update \
    && apt-get install -y --no-install-recommends libpq-dev \
    && rm -rf /var/lib/apt/lists/*

# Copy over artifacts from the builder
COPY --from=build /vrs-python /vrs-python
COPY --from=build /seqrepo-${ASSEMBLY} /seqrepo-${ASSEMBLY}

# Copy over run script and make sure it is executable regardless of the
# file mode it had in the build context
COPY ./run.sh /run.sh
RUN chmod 0755 /run.sh

# Set environment variables
ENV GA4GH_VRS_DATAPROXY_URI="seqrepo+file:///seqrepo-${ASSEMBLY}/master"
ENV VIRTUAL_ENV=/vrs-python/venv
ENV PATH=/vrs-python/venv/bin:$PATH

WORKDIR /

ENTRYPOINT [ "/run.sh" ]
57 changes: 57 additions & 0 deletions vrs-in-a-box/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# VRS-in-a-Box VCF Annotator for Assembly GRCh37 or GRCh38
VRS-in-a-Box is a single Docker image that is able to annotate a VCF with VRS IDs
using the `vrs-annotate` tool that is part of `vrs-python`. The Docker image includes
all the dependencies required for VRS ID computation of genomic variants on one of the
assembled chromosomes for a specific reference assembly.

The size of the Docker image is kept to a minimum by creating an instance of `seqrepo`
that contains only the assembled chromosomes for a single reference assembly.

Prebuilt images are available in Docker Hub: https://hub.docker.com/u/ga4gh

## Building VRS-in-a-Box
The following instructions describe how to build a VRS-in-a-Box image from scratch.


#### Create the Build Environment
Install any [prerequisites](https://github.com/biocommons/biocommons.seqrepo#requirements)
needed for `seqrepo` and create a Python virtual environment.
```bash
python -m venv venv
source venv/bin/activate
pip install biocommons.seqrepo
```

#### Download the Reference Assembly Sequences
```bash
# GRCh38
curl -O https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.26_GRCh38/GCF_000001405.26_GRCh38_genomic.fna.gz
# GRCh37
curl -O https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.13_GRCh37/GCF_000001405.13_GRCh37_genomic.fna.gz
```

#### Build Seqrepo
```bash
# GRCh38
bash seqrepo.sh GCF_000001405.26_GRCh38_genomic.fna.gz GRCh38
# GRCh37
bash seqrepo.sh GCF_000001405.13_GRCh37_genomic.fna.gz GRCh37
```

## Build Images for Each Assembly
```shell
# GRCh38
docker build --build-arg ASSEMBLY=GRCh38 --build-arg VRS_PYTHON_VERSION=2.1.2 -t ga4gh/vrs-vcf-annotator-grch38:2.1.2 .
# GRCh37
docker build --build-arg ASSEMBLY=GRCh37 --build-arg VRS_PYTHON_VERSION=2.1.2 -t ga4gh/vrs-vcf-annotator-grch37:2.1.2 .
```

## Running the Image in Docker
Run the image to annotate the VCF file `NA12878.vcf` in the current directory:
```shell
docker run -it --rm -v "$(pwd)":/input ga4gh/vrs-vcf-annotator-grch38:2.1.2 /input/NA12878.vcf --vcf-out /input/NA12878_with_vrs.vcf
```
Run the image to annotate the VCF file `NA12878.vcf` in the current directory and capture the VRS objects in a separate file:
```shell
docker run -it --rm -v "$(pwd)":/input ga4gh/vrs-vcf-annotator-grch38:2.1.2 /input/NA12878.vcf --vcf-out /input/NA12878_with_vrs.vcf --ndjson-out /input/vrs-objects.json
```
2 changes: 2 additions & 0 deletions vrs-in-a-box/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/sh
# Container entrypoint: annotate a VCF with VRS IDs via `vrs-annotate vcf`.
#
# Environment:
#   ASSEMBLY  Reference assembly name passed to --assembly (required).
# Arguments:
#   All arguments are forwarded unchanged to `vrs-annotate vcf`.

# Fail with a clear message instead of letting --assembly swallow the first
# user argument when ASSEMBLY is unset or empty.
: "${ASSEMBLY:?ASSEMBLY must be set}"

# "$@" is quoted so arguments containing spaces or glob characters (e.g. VCF
# paths) reach the tool intact. exec replaces this shell with the annotator so
# it receives signals directly and its exit status becomes the script's.
exec /vrs-python/venv/bin/vrs-annotate vcf --assembly "${ASSEMBLY}" "$@"
69 changes: 69 additions & 0 deletions vrs-in-a-box/seqrepo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/bin/bash
#
# Build a seqrepo instance from a single assembly FASTA file and package it as
# seqrepo-<assembly_name>.zip in the current directory.
#
# Usage: seqrepo.sh <fasta_file> <assembly_name>
#
# Environment:
#   RSYNC  rsync executable handed to seqrepo via --rsync-exe (default: rsync)
#
# Requires: seqrepo, zip, and (for assemblies other than GRCh38) sqlite3.
set -e # Exit on any error

# Print an error message to stderr and exit with status 1.
die() {
  printf 'ERROR: %s\n' "$1" >&2
  exit 1
}

# Check that exactly two arguments are provided
if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <fasta_file> <assembly_name>" >&2
  exit 1
fi

FASTA_FILE=$1
ASSEMBLY_NAME=$2

[ -n "$ASSEMBLY_NAME" ] || die "Assembly name must not be empty."

export SEQREPO_ROOT="seqrepo-$ASSEMBLY_NAME"

# Check if the file exists
if [ ! -f "$FASTA_FILE" ]; then
  die "File '$FASTA_FILE' not found!"
fi

# Verify the required tools up front so a missing one is reported before the
# (potentially long) FASTA load instead of after it.
command -v seqrepo >/dev/null || die "'seqrepo' not found on PATH."
command -v zip >/dev/null || die "'zip' not found on PATH."
if [ "$ASSEMBLY_NAME" != "GRCh38" ]; then
  command -v sqlite3 >/dev/null || die "'sqlite3' not found on PATH."
fi

# Check if openrsync is in the output of "$RSYNC_EXE --version"
RSYNC_EXE=${RSYNC:-rsync}
# Options are kept in an array so paths containing spaces survive intact.
RSYNC_OPTION=()
if "$RSYNC_EXE" --version | grep -q "openrsync"; then
  if [ -x "/opt/homebrew/bin/rsync" ]; then
    RSYNC_OPTION=(--rsync-exe /opt/homebrew/bin/rsync)
  else
    echo "ERROR: seqrepo requires 'rsync' (not 'openrsync') to run." 1>&2
    echo "       On MacOS rsync can be installed with Homebrew." 1>&2
    echo "       Set the RSYNC environment variable to specify the rsync executable to use." 1>&2
    exit 1
  fi
else
  RSYNC_OPTION=(--rsync-exe "$RSYNC_EXE")
fi

# Run a seqrepo subcommand against $SEQREPO_ROOT with the selected rsync.
# NOTE(review): "-r seqrepo" is given before --root-directory; presumably both
# set the root directory and the later value wins - confirm and drop "-r seqrepo".
run_seqrepo() {
  seqrepo -r seqrepo --root-directory "$SEQREPO_ROOT" "${RSYNC_OPTION[@]}" "$@"
}

# Initialize seqrepo
echo "Initializing seqrepo in $SEQREPO_ROOT..."
run_seqrepo init

# Load the provided FASTA file
echo "Loading FASTA file: $FASTA_FILE"
run_seqrepo load "$FASTA_FILE" -n NCBI

# Add assembly names
echo "Adding assembly names..."
run_seqrepo add-assembly-names

echo "Initial seqrepo build completed"

# vrs-annotate checks that GRCh38:1 is in the database as a sanity check
# so for non-GRCh38, we have to add a dummy record so it will run.
# The heredoc delimiters are quoted so the SQL is passed to sqlite3 verbatim.
if [ "$ASSEMBLY_NAME" != "GRCh38" ]; then
  sqlite3 "$SEQREPO_ROOT/master/aliases.sqlite3" <<'EOF'
INSERT INTO seqalias (seq_id, namespace, alias, added, is_current)
VALUES ('Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO', 'GRCh38', '1', DATE('now'), 1);
EOF
  # NOTE(review): relpath looks like a placeholder for a sequence file that is
  # not part of this repository - confirm nothing tries to read it.
  sqlite3 "$SEQREPO_ROOT/master/sequences/db.sqlite3" <<'EOF'
INSERT INTO seqinfo (seq_id, len, alpha, added, relpath)
VALUES ('Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO', 248956422, 'ACGMNRT', date('now'), '2025/0331/1411/1743430287.165611.fa.bgz');
EOF
  echo "Spiked in GRCh38:1 sequence alias to make vrs-annotate run"
fi

# Create a zip archive of the seqrepo directory
echo "Creating seqrepo archive..."
# zip updates an existing archive in place, which would keep stale entries
# from an earlier build; start from a fresh archive instead.
rm -f -- "${SEQREPO_ROOT}.zip"
zip -r "${SEQREPO_ROOT}.zip" "${SEQREPO_ROOT}"
# ${VAR:?} aborts rather than expanding to an empty path.
rm -fR -- "${SEQREPO_ROOT:?}"

echo "Archive created ${SEQREPO_ROOT}.zip"
Loading