Skip to content

Commit bbc5eed

Browse files
committed
[EGGO-30] Generate flattened, partitioned data.
1 parent 4877751 commit bbc5eed

File tree

4 files changed

+49
-3
lines changed

4 files changed

+49
-3
lines changed

eggo/dag.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,40 @@ def output(self):
336336
edition=self.edition))
337337

338338

339+
class ADAMFlattenPartitionTask(Task):
    """Partition the flattened edition of a dataset with ADAM.

    Reads the ``flat`` edition from its S3N URL, runs ``adam-submit ...
    partition`` with ``genotypes-flat-partition-strategy.json`` and writes
    the result to the ``flat_locuspart`` edition. A success flag is created
    at the target only when the ADAM command exits with status 0.

    Parameters:
        adam_command: forwarded unchanged to the upstream task.
        allowed_file_formats: forwarded unchanged to the upstream task.
    """

    adam_command = Parameter()
    allowed_file_formats = Parameter()
    # Edition consumed as input by ``run`` and edition produced as output.
    source_edition = 'flat'
    edition = 'flat_locuspart'

    def requires(self):
        """Return the upstream task whose output this task partitions.

        The input of ``run`` is the ``flat`` edition, so the dependency must
        be the flattening task rather than ``ADAMBasicTask``; otherwise the
        source data is not guaranteed to exist when this task starts.
        NOTE(review): assumes ADAMFlattenTask is the producer of the 'flat'
        edition -- confirm against its definition.
        """
        return ADAMFlattenTask(adam_command=self.adam_command,
                               allowed_file_formats=self.allowed_file_formats)

    def run(self):
        """Run the ADAM ``partition`` command and flag the target on success.

        Requires the ``ADAM_HOME`` and ``SPARK_MASTER_URL`` environment
        variables (``KeyError`` if either is missing).

        Raises:
            RuntimeError: if the ADAM command exits with a non-zero status,
                so the failure is reported instead of being silently
                dropped.
        """
        dataset_name = ToastConfig().config['name']
        adam_cmd = ('{adam_home}/bin/adam-submit --master {spark_master_url} partition'
                    ' -partition_strategy_file {partition_strategy_file}'
                    ' {source} {target}').format(
                        adam_home=os.environ['ADAM_HOME'],
                        spark_master_url=os.environ['SPARK_MASTER_URL'],
                        partition_strategy_file='genotypes-flat-partition-strategy.json',
                        source=target_s3n_url(dataset_name,
                                              edition=self.source_edition),
                        target=target_s3n_url(dataset_name,
                                              edition=self.edition))
        p = Popen(adam_cmd, shell=True)
        p.wait()

        if p.returncode != 0:
            # Without this, a failed job would return normally and leave the
            # target unflagged with no indication of what went wrong.
            raise RuntimeError(
                'ADAM partition command exited with status {0}: {1}'.format(
                    p.returncode, adam_cmd))

        create_SUCCESS_file(target_s3_url(dataset_name,
                                          edition=self.edition))

    def output(self):
        """Return the S3 flag target for the ``flat_locuspart`` edition."""
        return S3FlagTarget(target_s3_url(ToastConfig().config['name'],
                                          edition=self.edition))
371+
372+
339373
class VCF2ADAMTask(Task):
340374

341375
def requires(self):
@@ -344,7 +378,9 @@ def requires(self):
344378
flat = ADAMFlattenTask(adam_command='vcf2adam',
345379
allowed_file_formats=['vcf'])
346380
locuspart = ADAMPartitionTask(adam_command='vcf2adam',
347-
allowed_file_formats=['vcf'])
381+
allowed_file_formats=['vcf'])
382+
flat_locuspart = ADAMFlattenPartitionTask(adam_command='vcf2adam',
383+
allowed_file_formats=['vcf'])
348384
dependencies = [basic]
349385
for edition in ToastConfig().config['editions']:
350386
if edition == 'basic':
@@ -353,6 +389,8 @@ def requires(self):
353389
dependencies.append(flat)
354390
elif edition == 'locuspart':
355391
dependencies.append(locuspart)
392+
elif edition == 'flat_locuspart':
393+
dependencies.append(flat_locuspart)
356394
return dependencies
357395

358396
def run(self):
@@ -371,6 +409,8 @@ def requires(self):
371409
allowed_file_formats=['sam', 'bam'])
372410
locuspart = ADAMPartitionTask(adam_command='transform',
373411
allowed_file_formats=['sam', 'bam'])
412+
flat_locuspart = ADAMFlattenPartitionTask(adam_command='transform',
413+
allowed_file_formats=['sam', 'bam'])
374414
dependencies = [basic]
375415
for edition in ToastConfig().config['editions']:
376416
if edition == 'basic':
@@ -379,5 +419,7 @@ def requires(self):
379419
dependencies.append(flat)
380420
elif edition == 'locuspart':
381421
dependencies.append(locuspart)
422+
elif edition == 'flat_locuspart':
423+
dependencies.append(flat_locuspart)
382424
return dependencies
383425

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
[
2+
{ "type": "identity", "source": "variant__contig__contigName", "name": "chr" },
3+
{ "type": "range", "source": "variant__start", "name": "pos", "range": 1000000 }
4+
]

genotypes-partition-strategy.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
[
22
{ "type": "identity", "source": "variant.contig.contigName", "name": "chr" },
3-
{ "type": "range", "source": "variant.start", "name": "pos", "range": 10000 }
3+
{ "type": "range", "source": "variant.start", "name": "pos", "range": 1000000 }
44
]

test/registry/test-genotypes.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"title": "Test 1000 Genomes Project VCF data",
44
"target": "test/genotypes",
55
"dag": "VCF2ADAMTask",
6-
"editions": ["basic", "locuspart"],
6+
"editions": ["basic", "flat", "locuspart", "flat_locuspart"],
77
"sources": [
88
{"format": "vcf", "compression": true, "url": "https://github.com/bigdatagenomics/eggo/raw/master/test/resources/chr22.small.vcf.gz"}
99
]

0 commit comments

Comments
 (0)