14 changes: 8 additions & 6 deletions charcoal/Snakefile
@@ -217,21 +217,23 @@ rule search_all:
"""

# generate contigs taxonomy
rule make_contigs_taxonomy_json:
rule make_taxonomy_json:
input:
genome = genome_dir + '/{f}',
genome_sig = output_dir + '/{f}.sig',
matches = output_dir + '/{f}.matches.sig',
lineages = config['lineages_csv']
output:
json=output_dir + '/{f}.contigs-tax.json',
contig_json=output_dir + '/{f}.contigs-tax.json',
genome_json=output_dir + '/{f}.genome-tax.json',
conda: 'conf/env-sourmash.yml'
shell: """
python -m charcoal.contigs_search \
python -m charcoal.gather_taxonomy \
--genome {input.genome} --lineages-csv {input.lineages} \
--genome-sig {input.genome_sig} \
--matches-sig {input.matches} \
--json-out {output.json}
--contigs-json-out {output.contig_json} \
--genome-json-out {output.genome_json}
"""

# actually do cleaning
@@ -301,8 +303,8 @@ rule combined_summary:
 # compare all the genomes.
 rule compare_taxonomy:
     input:
-        all_json=expand(output_dir + '/{g}.contigs-tax.json', g=genome_list),
-        all_sig=expand(output_dir + '/{g}.matches.sig', g=genome_list),
+        all_contigs_json=expand(output_dir + '/{g}.contigs-tax.json', g=genome_list),
+        all_genome_json=expand(output_dir + '/{g}.genome-tax.json', g=genome_list),
         lineages = config['lineages_csv'],
         provided_lineages = provided_lineages_file,
         genome_list = genome_list_file
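Both new outputs are plain JSON dictionaries keyed by name. A minimal sketch of consuming them downstream, assuming the [length, num_hashes, gather_results] value shape visible in the saved test fixture further below (paths here are illustrative, not part of the PR):

import json

# hypothetical paths matching the rule's output patterns above
with open('output.dir/example.fa.gz.contigs-tax.json', 'rt') as fp:
    contigs_tax = json.load(fp)   # {contig_name: [length, num_hashes, gather_results]}

with open('output.dir/example.fa.gz.genome-tax.json', 'rt') as fp:
    genome_tax = json.load(fp)    # {genome_name: [length, num_hashes, gather_results]}

for name, (length, num_hashes, gather_results) in genome_tax.items():
    print(name, length, num_hashes, len(gather_results))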
93 changes: 25 additions & 68 deletions charcoal/compare_taxonomy.py
@@ -63,22 +63,23 @@ def calculate_clean(genome_lin, contigs_d, rank):
     return (good_names, bad_names)


-def guess_tax_by_gather(entire_mh, lca_db, lin_db, match_rank, report_fp):
+def guess_tax_by_gather(gather_results, num_hashes, match_rank, report_fp, minimum_matches=GATHER_MIN_MATCHES):
     "Guess likely taxonomy using gather."
     sum_ident = 0
     first_lin = ()
     first_count = 0

-    for lin, count in gather_at_rank(entire_mh, lca_db, lin_db, match_rank):
-        if count >= GATHER_MIN_MATCHES:
+    # if match_rank is genus, this should be same as using gather_results
+    rank_gather = summarize_at_rank(gather_results, match_rank)
+    for lin, count in rank_gather:
+        if count >= minimum_matches:
             # record the first lineage we come across as likely lineage.
             if not first_lin:
                 first_lin = lin
                 first_count = count

         sum_ident += count

-    f_ident = sum_ident / len(entire_mh)
+    f_ident = sum_ident / num_hashes
     f_major = first_count / sum_ident

     return first_lin, f_ident, f_major
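summarize_at_rank itself is not shown in this diff; the sketch below is a hypothetical reimplementation to illustrate what the refactor assumes: gather results are (lineage, count) pairs, and summarizing at a coarser rank truncates each lineage at that rank and sums the counts.

from collections import defaultdict

def summarize_at_rank(gather_results, rank):
    # gather_results: [(lineage, count), ...]; a lineage is a tuple of
    # (rank, name) pairs ordered superkingdom -> ... -> genus.
    counts = defaultdict(int)
    for lineage, count in gather_results:
        truncated = []
        for pair in lineage:
            truncated.append(tuple(pair))
            if pair[0] == rank:
                break
        counts[tuple(truncated)] += count
    # largest count first, so guess_tax_by_gather's first qualifying hit
    # is the majority lineage
    return sorted(counts.items(), key=lambda kv: -kv[1])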
@@ -117,71 +118,28 @@ def choose_genome_lineage(guessed_genome_lineage, provided_lineage, match_rank,
     return genome_lineage, comment, needs_lineage


-def get_genome_taxonomy(matches_filename, genome_sig_filename, provided_lineage,
+def get_genome_taxonomy(genome_name, genome_gather_json_filename, provided_lineage,
                         tax_assign, match_rank, min_f_ident, min_f_major):
-    with open(matches_filename, 'rt') as fp:
-        try:
-            siglist = list(sourmash.load_signatures(fp, do_raise=True, quiet=True))
-        except sourmash.exceptions.SourmashError:
-            siglist = None
-
-    if not siglist:
-        comment = 'no matches for this genome.'
-        print(comment)
-        return None, comment, False, 0.0, 0.0
-
-    # construct a template minhash object that we can use to create new 'uns
-    empty_mh = siglist[0].minhash.copy_and_clear()
-    ksize = empty_mh.ksize
-    scaled = empty_mh.scaled
-    moltype = empty_mh.moltype
-
-    genome_sig = sourmash.load_one_signature(genome_sig_filename)
-    entire_mh = genome_sig.minhash
-
-    assert entire_mh.scaled == scaled
-
-    # Hack for examining members of our search database: remove exact matches.
-    new_siglist = []
-    for ss in siglist:
-        if entire_mh.similarity(ss.minhash) < 1.0:
-            new_siglist.append(ss)
-        else:
-            if provided_lineage and provided_lineage != 'NA':
-                print(f'found exact match: {ss.name()}. removing.')
-            else:
-                print(f'found exact match: {ss.name()}. but no provided lineage!')
-                comment = f'found exact match: {ss.name()}. but no provided lineage! cannot analyze.'
-                return None, comment, True, 1.0, 1.0
-
-    # ...but leave exact matches in if they're the only matches, I guess!
-    if new_siglist:
-        siglist = new_siglist
-
-    # create empty LCA database to populate...
-    lca_db = LCA_Database(ksize=ksize, scaled=scaled, moltype=moltype)
-    lin_db = LineageDB()
+    guessed_genome_lineage, f_major, f_ident = "", 0.0, 0.0
+    # did we get gather results?
+    genome_info = utils.load_contigs_gather_json(genome_gather_json_filename)

-    # ...with specific matches.
-    for ss in siglist:
-        ident = get_ident(ss)
-        lineage = tax_assign[ident]
+    if genome_info:
+        gather_results = genome_info[genome_name].gather_tax
+        genome_len = genome_info[genome_name].length
+        genome_hashes = genome_info[genome_name].num_hashes

-        lca_db.insert(ss, ident=ident)
-        lin_db.insert(ident, lineage)
-
-    print(f'loaded {len(siglist)} signatures & created LCA Database')
+        # calculate lineage from majority vote on LCA
+        guessed_genome_lineage, f_major, f_ident = guess_tax_by_gather(gather_results, genome_hashes, match_rank, sys.stdout)

-    # calculate lineage from majority vote on LCA
-    guessed_genome_lineage, f_major, f_ident = \
-        guess_tax_by_gather(entire_mh, lca_db, lin_db, match_rank, sys.stdout)
+        print(f'Gather classification on this genome yields: {pretty_print_lineage(guessed_genome_lineage)}')

-    print(f'Gather classification on this genome yields: {pretty_print_lineage(guessed_genome_lineage)}')
-
-    if f_major == 1.0 and f_ident == 1.0:
-        comment = "All genome hashes belong to one lineage! Nothing to do."
-        print(comment)
-        return guessed_genome_lineage, comment, False, f_major, f_ident
+        if f_major == 1.0 and f_ident == 1.0:
+            comment = "All genome hashes belong to one lineage! Nothing to do."
+            print(comment)
+            return guessed_genome_lineage, comment, False, f_major, f_ident

     # did we get a passed-in lineage assignment?
     provided_lin = ""
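utils.load_contigs_gather_json is likewise not part of this diff. Given the attribute accesses above (.gather_tax, .length, .num_hashes) and the JSON written by gather_taxonomy, a plausible sketch of the loader (hypothetical; the real helper lives in charcoal.utils):

import json
from collections import namedtuple

# field names assumed from the attribute accesses in get_genome_taxonomy()
ContigGatherInfo = namedtuple('ContigGatherInfo',
                              ['length', 'num_hashes', 'gather_tax'])

def load_contigs_gather_json(filename):
    # rehydrate {name: ContigGatherInfo} from a *-tax.json file
    with open(filename, 'rt') as fp:
        raw = json.load(fp)
    info = {}
    for name, (length, num_hashes, gather_tax) in raw.items():
        # JSON round-trips tuples as lists; restore hashable lineage tuples
        gather_tax = [(tuple(tuple(pair) for pair in lin), count)
                      for lin, count in gather_tax]
        info[name] = ContigGatherInfo(length, num_hashes, gather_tax)
    return info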
@@ -236,13 +194,12 @@ def main(args):
     # process every genome individually.
     summary_d = {}
     for n, genome_name in enumerate(genome_names):
-        matches_filename = os.path.join(dirname, genome_name + '.matches.sig')
-        genome_sig = os.path.join(dirname, genome_name + '.sig')
         lineage = provided_lineages.get(genome_name, '')
+        genome_json = os.path.join(dirname, genome_name + '.genome-tax.json')
+        contigs_json = os.path.join(dirname, genome_name + '.contigs-tax.json')

-        x = get_genome_taxonomy(matches_filename,
-                                genome_sig,
+        x = get_genome_taxonomy(genome_name,
+                                genome_json,
                                 lineage,
                                 tax_assign, match_rank,
                                 args.min_f_ident,
@@ -354,7 +311,7 @@ def main(args):
     # output a sorted hit list CSV
     fp = open(args.hit_list, 'wt')
     hitlist_w = csv.writer(fp)

     hitlist_w.writerow(['genome', 'filter_at', 'override_filter_at',
                         'total_bad_bp', 'superkingdom_bad_bp', 'phylum_bad_bp',
                         'class_bad_bp', 'order_bad_bp', 'family_bad_bp', 'genus_bad_bp',
32 changes: 27 additions & 5 deletions charcoal/contigs_search.py → charcoal/gather_taxonomy.py
@@ -54,9 +54,11 @@ def main(args):
     # complain.)
     if not siglist:
         print('no non-identical matches for this genome, exiting.')
-        contigs_tax = {}
-        with open(args.json_out, 'wt') as fp:
+        contigs_tax, genome_tax = {},{}
+        with open(args.contigs_json_out, 'wt') as fp:
             fp.write(json.dumps(contigs_tax))
+        with open(args.genome_json_out, 'wt') as fp:
+            fp.write(json.dumps(genome_tax))
         return 0

     # construct a template minhash object that we can use to create new 'uns
@@ -83,7 +85,8 @@ def main(args):
     print(f'reading contigs from {genomebase}')

     screed_iter = screed.open(args.genome)
-    contigs_tax = {}
+    contigs_tax, genome_tax = {},{}
+    # contigs search
     for n, record in enumerate(screed_iter):
         # look at each contig individually
         mh = empty_mh.copy_and_clear()
@@ -100,9 +103,25 @@ def main(args):
     print(f"Processed {len(contigs_tax)} contigs.")

     # save!
-    with open(args.json_out, 'wt') as fp:
+    with open(args.contigs_json_out, 'wt') as fp:
         fp.write(json.dumps(contigs_tax))

+    # genome search
+    genome_len=0
+    entire_mh = genome_sig.minhash
+    genome_name = os.path.basename(genome_sig.name())
+    num_hashes = len(entire_mh.hashes)
+    if not genome_len:
+        for record in screed_iter:
+            genome_len+=len(record.sequence)
+    gather_results = list(gather_at_rank(entire_mh, lca_db, lin_db, match_rank))
+    # currently treats genome as a giant contig, since we need the same info. Any reason to include addl info?
+    genome_gather_info = ContigGatherInfo(genome_len, len(entire_mh), gather_results)
+    genome_tax[genome_name] = genome_gather_info
+    with open(args.genome_json_out, 'wt') as fp:
+        fp.write(json.dumps(genome_tax))
+
     return 0
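Because ContigGatherInfo is a namedtuple (a tuple subclass), json.dumps flattens it to a plain list, which is why the saved fixture below stores [length, num_hashes, gather_results] with no field names. A quick self-contained illustration (the namedtuple definition is assumed here; charcoal defines the real one elsewhere):

import json
from collections import namedtuple

ContigGatherInfo = namedtuple('ContigGatherInfo',
                              ['length', 'num_hashes', 'gather_tax'])

info = ContigGatherInfo(0, 2723, [])
print(json.dumps({'example.fa.gz': info}))
# prints: {"example.fa.gz": [0, 2723, []]}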


@@ -116,7 +135,10 @@ def cmdline(sys_args):
     p.add_argument('--force', help='continue past survivable errors',
                    action='store_true')

-    p.add_argument('--json-out',
+    p.add_argument('--contigs-json-out',
+                   help='JSON-format output file of all contig tax results',
+                   required=True)
+    p.add_argument('--genome-json-out',
                    help='JSON-format output file of all tax results',
                    required=True)
     args = p.parse_args()
1 change: 1 addition & 0 deletions tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.genome-tax.json
@@ -0,0 +1 @@
{"LoombaR_2017__SID1050_bax__bin.11.fa.gz": [0, 2723, [[[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Acutalibacteraceae"], ["genus", "g__Anaeromassilibacillus"]], 1995], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Ruminococcaceae"], ["genus", "g__Angelakisella"]], 18], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes"], ["class", "c__Bacilli"], ["order", "o__Erysipelotrichales"], ["family", "f__Erysipelotrichaceae"], ["genus", "g__OEMR01"]], 14], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Ruminococcaceae"], ["genus", "g__Gemmiger_A"]], 10], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Acutalibacteraceae"], ["genus", "g__An200"]], 10], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Lachnospirales"], ["family", "f__Anaerotignaceae"], ["genus", "g__Anaerotignum"]], 9], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Oscillospiraceae"], ["genus", "g__Lawsonibacter"]], 8], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Oscillospiraceae"], ["genus", "g__Flavonifractor"]], 8], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Lachnospirales"], ["family", "f__Lachnospiraceae"], ["genus", "g__Blautia_A"]], 3], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Lachnospirales"], ["family", "f__Lachnospiraceae"], ["genus", "g__GCA-900066575"]], 2], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Ruminococcaceae"], ["genus", "g__Fournierella"]], 2], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes_A"], ["class", "c__Clostridia"], ["order", "o__Oscillospirales"], ["family", "f__Oscillospiraceae"], ["genus", "g__Pseudoflavonifractor"]], 1], [[["superkingdom", "d__Bacteria"], ["phylum", "p__Firmicutes"], ["class", "c__Bacilli"], ["order", "o__Erysipelotrichales"], ["family", "f__Erysipelotrichaceae"], ["genus", "g__Merdibacter"]], 1]]]}
35 changes: 26 additions & 9 deletions tests/test_contigs_search.py → tests/test_gather_taxonomy.py
@@ -2,7 +2,7 @@
 from . import pytest_utils as utils
 import json

-from charcoal import contigs_search
+from charcoal import gather_taxonomy


 @utils.in_tempdir
@@ -13,14 +13,19 @@ def test_1(location):
     args.genome_sig = utils.relative_file("tests/test-data/genomes/2.fa.gz.sig")
     args.matches_sig = utils.relative_file("tests/test-data/2.fa.gz.gather-matches.sig.gz")
     args.lineages_csv = utils.relative_file("tests/test-data/test-match-lineages.csv")
-    args.json_out = os.path.join(location, 'tax.json')
+    args.contigs_json_out = os.path.join(location, 'contigs-tax.json')
+    args.genome_json_out = os.path.join(location, 'genome-tax.json')

-    status = contigs_search.main(args)
+    status = gather_taxonomy.main(args)

     assert status == 0
-    assert os.path.exists(args.json_out)
+    assert os.path.exists(args.contigs_json_out)
+    assert os.path.exists(args.genome_json_out)

-    with open(args.json_out, 'rt') as fp:
+    with open(args.contigs_json_out, 'rt') as fp:
         results = json.load(fp)
         assert results == {}
+    with open(args.genome_json_out, 'rt') as fp:
+        results = json.load(fp)
+        assert results == {}

@@ -33,18 +38,30 @@ def test_2_loomba(location):
     args.genome_sig = utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.sig")
     args.matches_sig = utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.matches.sig")
     args.lineages_csv = utils.relative_file("tests/test-data/test-match-lineages.csv")
-    args.json_out = os.path.join(location, 'tax.json')
+    args.contigs_json_out = os.path.join(location, 'contigs-tax.json')
+    args.genome_json_out = os.path.join(location, 'genome-tax.json')

-    status = contigs_search.main(args)
+    status = gather_taxonomy.main(args)

     assert status == 0
-    assert os.path.exists(args.json_out)
+    assert os.path.exists(args.contigs_json_out)

-    with open(args.json_out, 'rt') as fp:
+    # check contig results
+    with open(args.contigs_json_out, 'rt') as fp:
         this_results = json.load(fp)

     saved_results_file = utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.contigs-tax.json")
     with open(saved_results_file, 'rt') as fp:
         saved_results = json.load(fp)

     assert this_results == saved_results
+
+    # check genome results
+    with open(args.genome_json_out, 'rt') as fp:
+        this_results = json.load(fp)
+
+    saved_results_file = utils.relative_file("tests/test-data/loomba/LoombaR_2017__SID1050_bax__bin.11.fa.gz.genome-tax.json")
+    with open(saved_results_file, 'rt') as fp:
+        saved_results = json.load(fp)
+
+    assert this_results == saved_results
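The tests drive main() directly with an argparse-style namespace rather than shelling out; the file itself runs under pytest as usual (python -m pytest tests/test_gather_taxonomy.py). A condensed sketch of the same pattern outside the test harness, with SimpleNamespace standing in for the harness's args object and illustrative paths:

import os
from types import SimpleNamespace

from charcoal import gather_taxonomy

args = SimpleNamespace(
    genome="tests/test-data/genomes/2.fa.gz",
    genome_sig="tests/test-data/genomes/2.fa.gz.sig",
    matches_sig="tests/test-data/2.fa.gz.gather-matches.sig.gz",
    lineages_csv="tests/test-data/test-match-lineages.csv",
    contigs_json_out="/tmp/contigs-tax.json",
    genome_json_out="/tmp/genome-tax.json",
    force=False,
)

assert gather_taxonomy.main(args) == 0
assert os.path.exists(args.contigs_json_out)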