14 changes: 8 additions & 6 deletions charcoal/Snakefile
@@ -217,21 +217,23 @@ rule search_all:
"""

# generate contigs taxonomy
rule make_contigs_taxonomy_json:
rule make_taxonomy_json:
input:
genome = genome_dir + '/{f}',
genome_sig = output_dir + '/{f}.sig',
matches = output_dir + '/{f}.matches.sig',
lineages = config['lineages_csv']
output:
json=output_dir + '/{f}.contigs-tax.json',
contig_json=output_dir + '/{f}.contigs-tax.json',
genome_json=output_dir + '/{f}.genome-tax.json',
conda: 'conf/env-sourmash.yml'
shell: """
python -m charcoal.contigs_search \
python -m charcoal.gather_taxonomy \
--genome {input.genome} --lineages-csv {input.lineages} \
--genome-sig {input.genome_sig} \
--matches-sig {input.matches} \
--json-out {output.json}
--contigs-json-out {output.contig_json} \
--genome-json-out {output.genome_json}
"""

# actually do cleaning
@@ -301,8 +303,8 @@ rule combined_summary:
 # compare all the genomes.
 rule compare_taxonomy:
     input:
-        all_json=expand(output_dir + '/{g}.contigs-tax.json', g=genome_list),
-        all_sig=expand(output_dir + '/{g}.matches.sig', g=genome_list),
+        all_contigs_json=expand(output_dir + '/{g}.contigs-tax.json', g=genome_list),
+        all_genome_json=expand(output_dir + '/{g}.genome-tax.json', g=genome_list),
         lineages = config['lineages_csv'],
         provided_lineages = provided_lineages_file,
         genome_list = genome_list_file
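Both new outputs are plain JSON dictionaries keyed by name. A minimal sketch of consuming them downstream, assuming the [length, num_hashes, gather_results] value shape visible in the saved test fixture further below (paths here are illustrative, not part of the PR):

import json

# hypothetical paths matching the rule's output patterns above
with open('output.dir/example.fa.gz.contigs-tax.json', 'rt') as fp:
    contigs_tax = json.load(fp)   # {contig_name: [length, num_hashes, gather_results]}

with open('output.dir/example.fa.gz.genome-tax.json', 'rt') as fp:
    genome_tax = json.load(fp)    # {genome_name: [length, num_hashes, gather_results]}

for name, (length, num_hashes, gather_results) in genome_tax.items():
    print(name, length, num_hashes, len(gather_results))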
93 changes: 25 additions & 68 deletions charcoal/compare_taxonomy.py
@@ -63,22 +63,23 @@ def calculate_clean(genome_lin, contigs_d, rank):
     return (good_names, bad_names)


-def guess_tax_by_gather(entire_mh, lca_db, lin_db, match_rank, report_fp):
+def guess_tax_by_gather(gather_results, num_hashes, match_rank, report_fp, minimum_matches=GATHER_MIN_MATCHES):
     "Guess likely taxonomy using gather."
     sum_ident = 0
     first_lin = ()
     first_count = 0

-    for lin, count in gather_at_rank(entire_mh, lca_db, lin_db, match_rank):
-        if count >= GATHER_MIN_MATCHES:
+    # if match_rank is genus, this should be same as using gather_results
+    rank_gather = summarize_at_rank(gather_results, match_rank)
+    for lin, count in rank_gather:
+        if count >= minimum_matches:
             # record the first lineage we come across as likely lineage.
             if not first_lin:
                 first_lin = lin
                 first_count = count

         sum_ident += count

-    f_ident = sum_ident / len(entire_mh)
+    f_ident = sum_ident / num_hashes
     f_major = first_count / sum_ident

     return first_lin, f_ident, f_major
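summarize_at_rank itself is not shown in this diff; the sketch below is a hypothetical reimplementation to illustrate what the refactor assumes: gather results are (lineage, count) pairs, and summarizing at a coarser rank truncates each lineage at that rank and sums the counts.

from collections import defaultdict

def summarize_at_rank(gather_results, rank):
    # gather_results: [(lineage, count), ...]; a lineage is a tuple of
    # (rank, name) pairs ordered superkingdom -> ... -> genus.
    counts = defaultdict(int)
    for lineage, count in gather_results:
        truncated = []
        for pair in lineage:
            truncated.append(tuple(pair))
            if pair[0] == rank:
                break
        counts[tuple(truncated)] += count
    # largest count first, so guess_tax_by_gather's first qualifying hit
    # is the majority lineage
    return sorted(counts.items(), key=lambda kv: -kv[1])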
@@ -117,71 +118,28 @@ def choose_genome_lineage(guessed_genome_lineage, provided_lineage, match_rank,
     return genome_lineage, comment, needs_lineage


-def get_genome_taxonomy(matches_filename, genome_sig_filename, provided_lineage,
+def get_genome_taxonomy(genome_name, genome_gather_json_filename, provided_lineage,
                         tax_assign, match_rank, min_f_ident, min_f_major):
-    with open(matches_filename, 'rt') as fp:
-        try:
-            siglist = list(sourmash.load_signatures(fp, do_raise=True, quiet=True))
-        except sourmash.exceptions.SourmashError:
-            siglist = None
-
-    if not siglist:
-        comment = 'no matches for this genome.'
-        print(comment)
-        return None, comment, False, 0.0, 0.0
-
-    # construct a template minhash object that we can use to create new 'uns
-    empty_mh = siglist[0].minhash.copy_and_clear()
-    ksize = empty_mh.ksize
-    scaled = empty_mh.scaled
-    moltype = empty_mh.moltype
-
-    genome_sig = sourmash.load_one_signature(genome_sig_filename)
-    entire_mh = genome_sig.minhash
-
-    assert entire_mh.scaled == scaled
-
-    # Hack for examining members of our search database: remove exact matches.
-    new_siglist = []
-    for ss in siglist:
-        if entire_mh.similarity(ss.minhash) < 1.0:
-            new_siglist.append(ss)
-        else:
-            if provided_lineage and provided_lineage != 'NA':
-                print(f'found exact match: {ss.name()}. removing.')
-            else:
-                print(f'found exact match: {ss.name()}. but no provided lineage!')
-                comment = f'found exact match: {ss.name()}. but no provided lineage! cannot analyze.'
-                return None, comment, True, 1.0, 1.0
-
-    # ...but leave exact matches in if they're the only matches, I guess!
-    if new_siglist:
-        siglist = new_siglist
-
-    # create empty LCA database to populate...
-    lca_db = LCA_Database(ksize=ksize, scaled=scaled, moltype=moltype)
-    lin_db = LineageDB()
+    guessed_genome_lineage, f_major, f_ident = "", 0.0, 0.0
+    # did we get gather results?
+    genome_info = utils.load_contigs_gather_json(genome_gather_json_filename)

-    # ...with specific matches.
-    for ss in siglist:
-        ident = get_ident(ss)
-        lineage = tax_assign[ident]
+    if genome_info:
+        gather_results = genome_info[genome_name].gather_tax
+        genome_len = genome_info[genome_name].length
+        genome_hashes = genome_info[genome_name].num_hashes

-        lca_db.insert(ss, ident=ident)
-        lin_db.insert(ident, lineage)
-
-    print(f'loaded {len(siglist)} signatures & created LCA Database')
+        # calculate lineage from majority vote on LCA
+        guessed_genome_lineage, f_major, f_ident = guess_tax_by_gather(gather_results, genome_hashes, match_rank, sys.stdout)

-    # calculate lineage from majority vote on LCA
-    guessed_genome_lineage, f_major, f_ident = \
-        guess_tax_by_gather(entire_mh, lca_db, lin_db, match_rank, sys.stdout)
+        print(f'Gather classification on this genome yields: {pretty_print_lineage(guessed_genome_lineage)}')

-    print(f'Gather classification on this genome yields: {pretty_print_lineage(guessed_genome_lineage)}')
-
-    if f_major == 1.0 and f_ident == 1.0:
-        comment = "All genome hashes belong to one lineage! Nothing to do."
-        print(comment)
-        return guessed_genome_lineage, comment, False, f_major, f_ident
+        if f_major == 1.0 and f_ident == 1.0:
+            comment = "All genome hashes belong to one lineage! Nothing to do."
+            print(comment)
+            return guessed_genome_lineage, comment, False, f_major, f_ident

     # did we get a passed-in lineage assignment?
     provided_lin = ""
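utils.load_contigs_gather_json is likewise not part of this diff. Given the attribute accesses above (.gather_tax, .length, .num_hashes) and the JSON written by gather_taxonomy, a plausible sketch of the loader (hypothetical; the real helper lives in charcoal.utils):

import json
from collections import namedtuple

# field names assumed from the attribute accesses in get_genome_taxonomy()
ContigGatherInfo = namedtuple('ContigGatherInfo',
                              ['length', 'num_hashes', 'gather_tax'])

def load_contigs_gather_json(filename):
    # rehydrate {name: ContigGatherInfo} from a *-tax.json file
    with open(filename, 'rt') as fp:
        raw = json.load(fp)
    info = {}
    for name, (length, num_hashes, gather_tax) in raw.items():
        # JSON round-trips tuples as lists; restore hashable lineage tuples
        gather_tax = [(tuple(tuple(pair) for pair in lin), count)
                      for lin, count in gather_tax]
        info[name] = ContigGatherInfo(length, num_hashes, gather_tax)
    return info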
@@ -236,13 +194,12 @@ def main(args):
     # process every genome individually.
     summary_d = {}
     for n, genome_name in enumerate(genome_names):
-        matches_filename = os.path.join(dirname, genome_name + '.matches.sig')
-        genome_sig = os.path.join(dirname, genome_name + '.sig')
         lineage = provided_lineages.get(genome_name, '')
+        genome_json = os.path.join(dirname, genome_name + '.genome-tax.json')
+        contigs_json = os.path.join(dirname, genome_name + '.contigs-tax.json')

-        x = get_genome_taxonomy(matches_filename,
-                                genome_sig,
+        x = get_genome_taxonomy(genome_name,
+                                genome_json,
                                 lineage,
                                 tax_assign, match_rank,
                                 args.min_f_ident,
@@ -354,7 +311,7 @@ def main(args):
     # output a sorted hit list CSV
     fp = open(args.hit_list, 'wt')
     hitlist_w = csv.writer(fp)

     hitlist_w.writerow(['genome', 'filter_at', 'override_filter_at',
                         'total_bad_bp', 'superkingdom_bad_bp', 'phylum_bad_bp',
                         'class_bad_bp', 'order_bad_bp', 'family_bad_bp', 'genus_bad_bp',
32 changes: 27 additions & 5 deletions charcoal/contigs_search.py → charcoal/gather_taxonomy.py
@@ -54,9 +54,11 @@ def main(args):
     # complain.)
     if not siglist:
         print('no non-identical matches for this genome, exiting.')
-        contigs_tax = {}
-        with open(args.json_out, 'wt') as fp:
+        contigs_tax, genome_tax = {},{}
+        with open(args.contigs_json_out, 'wt') as fp:
             fp.write(json.dumps(contigs_tax))
+        with open(args.genome_json_out, 'wt') as fp:
+            fp.write(json.dumps(genome_tax))
         return 0

     # construct a template minhash object that we can use to create new 'uns
@@ -83,7 +85,8 @@ def main(args):
     print(f'reading contigs from {genomebase}')

     screed_iter = screed.open(args.genome)
-    contigs_tax = {}
+    contigs_tax, genome_tax = {},{}
+    # contigs search
     for n, record in enumerate(screed_iter):
         # look at each contig individually
         mh = empty_mh.copy_and_clear()
@@ -100,9 +103,25 @@ def main(args):
     print(f"Processed {len(contigs_tax)} contigs.")

     # save!
-    with open(args.json_out, 'wt') as fp:
+    with open(args.contigs_json_out, 'wt') as fp:
         fp.write(json.dumps(contigs_tax))

+    # genome search
+    genome_len=0
+    entire_mh = genome_sig.minhash
+    genome_name = os.path.basename(genome_sig.name())
+    num_hashes = len(entire_mh.hashes)
+    if not genome_len:
+        for record in screed_iter:
+            genome_len+=len(record.sequence)
+    gather_results = list(gather_at_rank(entire_mh, lca_db, lin_db, match_rank))
+    # currently treats genome as a giant contig, since we need the same info. Any reason to include addl info?
+    genome_gather_info = ContigGatherInfo(genome_len, len(entire_mh), gather_results)
+    genome_tax[genome_name] = genome_gather_info
+    with open(args.genome_json_out, 'wt') as fp:
+        fp.write(json.dumps(genome_tax))
+
     return 0
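Because ContigGatherInfo is a namedtuple (a tuple subclass), json.dumps flattens it to a plain list, which is why the saved fixture below stores [length, num_hashes, gather_results] with no field names. A quick self-contained illustration (the namedtuple definition is assumed here; charcoal defines the real one elsewhere):

import json
from collections import namedtuple

ContigGatherInfo = namedtuple('ContigGatherInfo',
                              ['length', 'num_hashes', 'gather_tax'])

info = ContigGatherInfo(0, 2723, [])
print(json.dumps({'example.fa.gz': info}))
# prints: {"example.fa.gz": [0, 2723, []]}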


@@ -116,7 +135,10 @@ def cmdline(sys_args):
     p.add_argument('--force', help='continue past survivable errors',
                    action='store_true')

-    p.add_argument('--json-out',
+    p.add_argument('--contigs-json-out',
+                   help='JSON-format output file of all contig tax results',
+                   required=True)
+    p.add_argument('--genome-json-out',
                    help='JSON-format output file of all tax results',
                    required=True)
     args = p.parse_args()
1 change: 1 addition & 0 deletions tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.genome-tax.json
@@ -0,0 +1 @@
{"LoombaR_2017__SID1050_bax__bin.11.fa.gz": [0, 2723, [[[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Acutalibacteraceae"], ["genus", "g__Anaeromassilibacillus"]], 1995], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Ruminococcaceae"], ["genus", "g__Angelakisella"]], 18], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes"], ["class", "c__Bacilli"], ["order", "o__Erysipelotrichales"], ["family", "f__Erysipelotrichaceae"], ["genus", "g__OEMR01"]], 14], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Ruminococcaceae"], ["genus", "g__Gemmiger_A"]], 10], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Acutalibacteraceae"], ["genus", "g__An200"]], 10], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Lachnospirales"], ["family", "f__Anaerotignaceae"], ["genus", "g__Anaerotignum"]], 9], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Oscillospiraceae"], ["genus", "g__Lawsonibacter"]], 8], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Oscillospiraceae"], ["genus", "g__Flavonifractor"]], 8], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Lachnospirales"], ["family", "f__Lachnospiraceae"], ["genus", "g__Blautia_A"]], 3], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Lachnospirales"], ["family", "f__Lachnospiraceae"], ["genus", "g__GCA-900066575"]], 2], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Ruminococcaceae"], ["genus", "g__Fournierella"]], 2], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Oscillospiraceae"], ["genus", "g__Pseudoflavonifractor"]], 1], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes"], ["class", "c__Bacilli"], ["order", "o__Erysipelotrichales"], ["family", "f__Erysipelotrichaceae"], ["genus", "g__Merdibacter"]], 1]]]}
35 changes: 26 additions & 9 deletions tests/test_contigs_search.py → tests/test_gather_taxonomy.py
@@ -2,7 +2,7 @@
 from . import pytest_utils as utils
 import json

-from charcoal import contigs_search
+from charcoal import gather_taxonomy


 @utils.in_tempdir
@@ -13,14 +13,19 @@ def test_1(location):
     args.genome_sig = utils.relative_file("tests/test-data/genomes/2.fa.gz.sig")
     args.matches_sig = utils.relative_file("tests/test-data/2.fa.gz.gather-matches.sig.gz")
     args.lineages_csv = utils.relative_file("tests/test-data/test-match-lineages.csv")
-    args.json_out = os.path.join(location, 'tax.json')
+    args.contigs_json_out = os.path.join(location, 'contigs-tax.json')
+    args.genome_json_out = os.path.join(location, 'genome-tax.json')

-    status = contigs_search.main(args)
+    status = gather_taxonomy.main(args)

     assert status == 0
-    assert os.path.exists(args.json_out)
+    assert os.path.exists(args.contigs_json_out)
+    assert os.path.exists(args.genome_json_out)

-    with open(args.json_out, 'rt') as fp:
+    with open(args.contigs_json_out, 'rt') as fp:
         results = json.load(fp)
         assert results == {}
+    with open(args.genome_json_out, 'rt') as fp:
+        results = json.load(fp)
+        assert results == {}

@@ -33,18 +38,30 @@ def test_2_loomba(location):
     args.genome_sig = utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.sig")
     args.matches_sig = utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.matches.sig")
     args.lineages_csv = utils.relative_file("tests/test-data/test-match-lineages.csv")
-    args.json_out = os.path.join(location, 'tax.json')
+    args.contigs_json_out = os.path.join(location, 'contigs-tax.json')
+    args.genome_json_out = os.path.join(location, 'genome-tax.json')

-    status = contigs_search.main(args)
+    status = gather_taxonomy.main(args)

     assert status == 0
-    assert os.path.exists(args.json_out)
+    assert os.path.exists(args.contigs_json_out)

-    with open(args.json_out, 'rt') as fp:
+    # check contig results
+    with open(args.contigs_json_out, 'rt') as fp:
         this_results = json.load(fp)

     saved_results_file = utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.contigs-tax.json")
     with open(saved_results_file, 'rt') as fp:
         saved_results = json.load(fp)

     assert this_results == saved_results
+
+    # check genome results
+    with open(args.genome_json_out, 'rt') as fp:
+        this_results = json.load(fp)
+
+    saved_results_file = utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.genome-tax.json")
+    with open(saved_results_file, 'rt') as fp:
+        saved_results = json.load(fp)
+
+    assert this_results == saved_results
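The tests drive main() directly with an argparse-style namespace rather than shelling out; the file itself runs under pytest as usual (python -m pytest tests/test_gather_taxonomy.py). A condensed sketch of the same pattern outside the test harness, with SimpleNamespace standing in for the harness's args object and illustrative paths:

import os
from types import SimpleNamespace

from charcoal import gather_taxonomy

args = SimpleNamespace(
    genome="tests/test-data/genomes/2.fa.gz",
    genome_sig="tests/test-data/genomes/2.fa.gz.sig",
    matches_sig="tests/test-data/2.fa.gz.gather-matches.sig.gz",
    lineages_csv="tests/test-data/test-match-lineages.csv",
    contigs_json_out="/tmp/contigs-tax.json",
    genome_json_out="/tmp/genome-tax.json",
    force=False,
)

assert gather_taxonomy.main(args) == 0
assert os.path.exists(args.contigs_json_out)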