Skip to content

Commit ec619d2

Browse files
committed
[EGGO-30] Add support for partitioning BAM/SAM.
1 parent 4e2b9d4 commit ec619d2

File tree

5 files changed: 23 additions and 16 deletions.

eggo/dag.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,7 @@ class ADAMPartitionTask(Task):
309309

310310
adam_command = Parameter()
311311
allowed_file_formats = Parameter()
312+
partition_strategy_file = Parameter()
312313
source_edition = 'basic'
313314
edition = 'locuspart'
314315

@@ -323,7 +324,7 @@ def run(self):
323324
hadoop_home=os.environ['HADOOP_HOME'],
324325
adam_partitioning_jar=os.environ['ADAM_PARTITIONING_JAR'],
325326
parallelism=1,
326-
partition_strategy_file='genotypes-partition-strategy',
327+
partition_strategy_file=self.partition_strategy_file,
327328
source=target_s3n_url(ToastConfig().config['name'],
328329
edition=self.source_edition),
329330
target=target_s3n_url(ToastConfig().config['name'],
@@ -344,6 +345,7 @@ class ADAMFlattenPartitionTask(Task):
344345

345346
adam_command = Parameter()
346347
allowed_file_formats = Parameter()
348+
partition_strategy_file = Parameter()
347349
source_edition = 'flat'
348350
edition = 'flat_locuspart'
349351

@@ -358,7 +360,7 @@ def run(self):
358360
hadoop_home=os.environ['HADOOP_HOME'],
359361
adam_partitioning_jar=os.environ['ADAM_PARTITIONING_JAR'],
360362
parallelism=1,
361-
partition_strategy_file='flat-genotypes-partition-strategy',
363+
partition_strategy_file=self.partition_strategy_file,
362364
source=target_s3n_url(ToastConfig().config['name'],
363365
edition=self.source_edition),
364366
target=target_s3n_url(ToastConfig().config['name'],
@@ -383,9 +385,11 @@ def requires(self):
383385
flat = ADAMFlattenTask(adam_command='vcf2adam',
384386
allowed_file_formats=['vcf'])
385387
locuspart = ADAMPartitionTask(adam_command='vcf2adam',
386-
allowed_file_formats=['vcf'])
388+
allowed_file_formats=['vcf'],
389+
partition_strategy_file='genotypes-partition-strategy')
387390
flat_locuspart = ADAMFlattenPartitionTask(adam_command='vcf2adam',
388-
allowed_file_formats=['vcf'])
391+
allowed_file_formats=['vcf'],
392+
partition_strategy_file='flat-genotypes-partition-strategy')
389393
dependencies = [basic]
390394
for edition in ToastConfig().config['editions']:
391395
if edition == 'basic':
@@ -413,9 +417,11 @@ def requires(self):
413417
flat = ADAMFlattenTask(adam_command='transform',
414418
allowed_file_formats=['sam', 'bam'])
415419
locuspart = ADAMPartitionTask(adam_command='transform',
416-
allowed_file_formats=['sam', 'bam'])
420+
allowed_file_formats=['sam', 'bam'],
421+
partition_strategy_file='alignments-partition-strategy')
417422
flat_locuspart = ADAMFlattenPartitionTask(adam_command='transform',
418-
allowed_file_formats=['sam', 'bam'])
423+
allowed_file_formats=['sam', 'bam'],
424+
partition_strategy_file='flat-alignments-partition-strategy')
419425
dependencies = [basic]
420426
for edition in ToastConfig().config['editions']:
421427
if edition == 'basic':

genotypes-flat-partition-strategy.json

Lines changed: 0 additions & 4 deletions
This file was deleted.

genotypes-partition-strategy.json

Lines changed: 0 additions & 4 deletions
This file was deleted.

(Newly added file; its name was not captured in this extract.)

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"name": "test-1kg-genotypes-subset",
3+
"title": "Test 1000 Genomes Project VCF data",
4+
"dag": "VCF2ADAMTask",
5+
"editions": ["basic", "flat", "locuspart", "flat_locuspart"],
6+
"sources": [
7+
{"format": "vcf", "compression": true, "url": "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20110521/ALL.chr22.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz"}
8+
]
9+
}

test/registry/test-alignments.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
"name": "test-alignments",
33
"title": "Test SAM data",
44
"dag": "BAM2ADAMTask",
5-
"editions": ["basic", "flat"],
5+
"editions": ["basic", "flat", "locuspart", "flat_locuspart"],
66
"sources": [
7-
{"format": "sam", "compression": false, "url": "https://raw.githubusercontent.com/bigdatagenomics/adam/master/adam-core/src/test/resources/reads12.sam"}
7+
{"format": "sam", "compression": false, "url": "https://github.com/bigdatagenomics/eggo/raw/master/test/resources/small.sam"}
88
]
99
}

0 commit comments

Comments
 (0)