diff --git a/src/svaba/DiscordantCluster.cpp b/src/svaba/DiscordantCluster.cpp index d498253..9250bc7 100644 --- a/src/svaba/DiscordantCluster.cpp +++ b/src/svaba/DiscordantCluster.cpp @@ -10,6 +10,8 @@ #define DISC_PAD 150 #define MIN_PER_CLUST 2 +#define MAX_CLUSTER_WIDTH 3000 +#define MAX_REGION_WIDTH 5000 #define DEFAULT_ISIZE_THRESHOLD 2000 // shouldn't be hit if isize was learned //#define DEBUG_CLUSTER 1 @@ -154,28 +156,35 @@ DiscordantClusterMap DiscordantCluster::clusterReads(svabaReadPtrVector& bav, SeqLib::Orientation por; for (const auto& [_, read] : reads) { - por = read->PairOrientation(); - m_reg1.strand = read->ReverseFlag() ? '-' : '+'; - m_reg2.strand = read->MateReverseFlag() ? '-' : '+'; - m_reg1.chr = read->ChrID(); - m_reg2.chr = read->MateChrID(); - - // get left side - if (read->Position() < m_reg1.pos1) - m_reg1.pos1 = read->Position(); - if (read->MatePosition() < m_reg2.pos1) - m_reg2.pos1 = read->MatePosition(); - - // get right side - if (read->PositionEnd() > m_reg1.pos2) - m_reg1.pos2 = read->PositionEnd(); - if (read->MatePosition() > m_reg2.pos2) - m_reg2.pos2 = read->MatePosition() + read->Length(); // since don't have mate end - - assert(m_reg1.Width() < 5000); - assert(m_reg2.Width() < 5000); + por = read->PairOrientation(); + m_reg1.strand = read->ReverseFlag() ? '-' : '+'; + m_reg2.strand = read->MateReverseFlag() ? '-' : '+'; + m_reg1.chr = read->ChrID(); + m_reg2.chr = read->MateChrID(); + + // get left side + if (read->Position() < m_reg1.pos1) + m_reg1.pos1 = read->Position(); + if (read->MatePosition() < m_reg2.pos1) + m_reg2.pos1 = read->MatePosition(); + + // get right side + if (read->PositionEnd() > m_reg1.pos2) + m_reg1.pos2 = read->PositionEnd(); + if (read->MatePosition() > m_reg2.pos2) + m_reg2.pos2 = read->MatePosition() + read->Length(); // since don't have mate end + + // Check region width constraints after building regions + if (m_reg1.Width() >= MAX_REGION_WIDTH) { + std::cerr << "Warning: Region 1 width (" << m_reg1.Width() << "bp) exceeds " << MAX_REGION_WIDTH << "bp limit. Cluster is too large." << std::endl; + } + if (m_reg2.Width() >= MAX_REGION_WIDTH) { + std::cerr << "Warning: Region 2 width (" << m_reg2.Width() << "bp) exceeds " << MAX_REGION_WIDTH << "bp limit. Cluster is too large." << std::endl; + } + assert(m_reg1.Width() < 5000); + assert(m_reg2.Width() < 5000); } - + mapq1 = __getMeanMapq(reads); mapq2 = __getMeanMapq(mates); assert(mapq1 >= 0); @@ -433,126 +442,6 @@ bool DiscordantCluster::__add_read_to_cluster(svabaReadClusterVector& cvec, } } -/* - bool DiscordantCluster::__add_read_to_cluster(svabaReadClusterVector &cvec, - svabaReadVector &clust, - const svabaRead &a, - bool mate) { - - // get the position of the previous read. If none, we're starting a new one so make a dummy - std::pair last_info; - if (clust.size() == 0) - last_info = {-1, -1}; - else if (mate) - last_info = {clust.back().MateChrID(), clust.back().MatePosition()}; - else - last_info = {clust.back().ChrID(), clust.back().Position()}; - - // get the position of the current read - std::pair this_info; - if (mate) - this_info = {a.MateChrID(), a.MatePosition()}; - else - this_info = {a.ChrID(), a.Position()}; - - // is this cluster too big? happens if too many discordant reads. Enforce a hard cutoff - bool too_big; - if (mate) - too_big = (clust.size() > 1 && (clust.back().MatePosition() - clust[0].MatePosition()) > 3000); - else - too_big = (clust.size() > 1 && (clust.back().Position() - clust[0].Position()) > 3000); - - // note to self. this rarely gets hit? - //if (too_big) - // std::cerr << "Cluster too big at " << clust[0].Brief() << " to " << clust.back().Brief() << ". Breaking off after 3000bp" << std::endl; - - // check if this read is close enough to the last - if (!too_big && (this_info.first == last_info.first) && (this_info.second - last_info.second) <= DISC_PAD) { - - // read belongs in current cluster, so add - clust.push_back(a); - last_info = this_info; - return true; - - // read does not belong to cluster. close this cluster and add to cvec - } else { - - // if enough supporting reads, add as a cluster - if (clust.size() >= MIN_PER_CLUST) { - cvec.push_back(clust); - } - - // clear this cluster and start a new one - clust.clear(); - clust.push_back(a); - - return false; - } -} - - -void DiscordantCluster::__cluster_mate_reads(svabaReadClusterVector& brcv, svabaReadClusterVector& fwd, svabaReadClusterVector& rev) -{ - // loop through the clusters, and cluster within clusters based on mate read - for (auto& v : brcv) - { - - svabaReadVector this_fwd, this_rev; - std::sort(v.begin(), v.end(), SeqLib::BamRecordSort::ByMatePosition()); - - for (auto& r : v) - { - - // not a discordant read, ignore it - if (r.dd <= 0) - continue; - - // forward clustering - if (!r.MateReverseFlag()) - __add_read_to_cluster(fwd, this_fwd, r, true); - // reverse clustering - else if (r.MateReverseFlag()) - __add_read_to_cluster(rev, this_rev, r, true); - - } - // finish the last clusters - if (this_fwd.size() > 0) - fwd.push_back(this_fwd); - if (this_rev.size() > 0) - rev.push_back(this_rev); - } // finish main cluster loop - } -void DiscordantCluster::__cluster_mate_reads(std::vector>& brcv, - std::vector>& fwd, - std::vector>& rev) -{ - for (auto& cluster : brcv) { - std::vector this_fwd, this_rev; - - std::sort(cluster.begin(), cluster.end(), - [](const svabaRead* a, const svabaRead* b) { - return (a->b.MateChrID() < b->b.MateChrID()) || - (a->b.MateChrID() == b->b.MateChrID() && - a->b.MatePosition() < b->b.MatePosition()); - }); - - for (const svabaRead* r : cluster) { - if (r->dd <= 0) - continue; - - if (!r->MateReverseFlag()) - __add_read_to_cluster(fwd, this_fwd, r, true); - else - __add_read_to_cluster(rev, this_rev, r, true); - } - - if (!this_fwd.empty()) - fwd.push_back(this_fwd); - if (!this_rev.empty()) - rev.push_back(this_rev); - } -} -*/ void DiscordantCluster::__cluster_mate_reads(svabaReadClusterVector& brcv, svabaReadClusterVector& fwd, svabaReadClusterVector& rev) @@ -584,9 +473,9 @@ void DiscordantCluster::__cluster_mate_reads(svabaReadClusterVector& brcv, } // finish the last clusters - if (__valid_cluster(this_fwd, true) & this_fwd.size() >= MIN_PER_CLUST) + if (__valid_cluster(this_fwd, true) && this_fwd.size() >= MIN_PER_CLUST) fwd.push_back(this_fwd); - if (__valid_cluster(this_rev, true) & this_rev.size() >= MIN_PER_CLUST) + if (__valid_cluster(this_rev, true) && this_rev.size() >= MIN_PER_CLUST) rev.push_back(this_rev); } } @@ -661,9 +550,9 @@ bool DiscordantCluster::__valid_cluster(svabaReadPtrVector& clust, bool ismate) // check if getting too wide bool too_wide; if (ismate) - too_wide = (clust.size() > 1 && (clust.back()->Position() - clust[0]->Position()) > 3000); + too_wide = (clust.size() > 1 && (clust.back()->MatePosition() - clust[0]->MatePosition()) > MAX_CLUSTER_WIDTH); else - too_wide = (clust.size() > 1 && (clust.back()->Position() - clust[0]->Position()) > 3000); + too_wide = (clust.size() > 1 && (clust.back()->Position() - clust[0]->Position()) > MAX_CLUSTER_WIDTH); return !too_wide; } @@ -688,6 +577,12 @@ void DiscordantCluster::__convertToDiscordantCluster(DiscordantClusterMap &dd, for (auto& v : cvec) { DiscordantCluster d(v, bav, header); dd[d.m_id] = d; + + // Warn if widths are too large + if (d.m_reg1.Width() >= MAX_REGION_WIDTH || d.m_reg2.Width() >= MAX_REGION_WIDTH) { + std::cerr << "Warning: Wide cluster " << d.m_id << " with region widths " + << d.m_reg1.Width() << "bp and " << d.m_reg2.Width() << "bp (>= " << MAX_REGION_WIDTH << "bp limit)" << std::endl; + } } }