Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 40 additions & 145 deletions src/svaba/DiscordantCluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

#define DISC_PAD 150
#define MIN_PER_CLUST 2
#define MAX_CLUSTER_WIDTH 3000
#define MAX_REGION_WIDTH 5000
#define DEFAULT_ISIZE_THRESHOLD 2000 // shouldn't be hit if isize was learned

//#define DEBUG_CLUSTER 1
Expand Down Expand Up @@ -154,28 +156,35 @@ DiscordantClusterMap DiscordantCluster::clusterReads(svabaReadPtrVector& bav,
SeqLib::Orientation por;
for (const auto& [_, read] : reads)
{
por = read->PairOrientation();
m_reg1.strand = read->ReverseFlag() ? '-' : '+';
m_reg2.strand = read->MateReverseFlag() ? '-' : '+';
m_reg1.chr = read->ChrID();
m_reg2.chr = read->MateChrID();

// get left side
if (read->Position() < m_reg1.pos1)
m_reg1.pos1 = read->Position();
if (read->MatePosition() < m_reg2.pos1)
m_reg2.pos1 = read->MatePosition();

// get right side
if (read->PositionEnd() > m_reg1.pos2)
m_reg1.pos2 = read->PositionEnd();
if (read->MatePosition() > m_reg2.pos2)
m_reg2.pos2 = read->MatePosition() + read->Length(); // since don't have mate end

assert(m_reg1.Width() < 5000);
assert(m_reg2.Width() < 5000);
por = read->PairOrientation();
m_reg1.strand = read->ReverseFlag() ? '-' : '+';
m_reg2.strand = read->MateReverseFlag() ? '-' : '+';
m_reg1.chr = read->ChrID();
m_reg2.chr = read->MateChrID();

// get left side
if (read->Position() < m_reg1.pos1)
m_reg1.pos1 = read->Position();
if (read->MatePosition() < m_reg2.pos1)
m_reg2.pos1 = read->MatePosition();

// get right side
if (read->PositionEnd() > m_reg1.pos2)
m_reg1.pos2 = read->PositionEnd();
if (read->MatePosition() > m_reg2.pos2)
m_reg2.pos2 = read->MatePosition() + read->Length(); // since don't have mate end

// Check region width constraints after building regions
if (m_reg1.Width() >= MAX_REGION_WIDTH) {
std::cerr << "Warning: Region 1 width (" << m_reg1.Width() << "bp) exceeds " << MAX_REGION_WIDTH << "bp limit. Cluster is too large." << std::endl;
}
if (m_reg2.Width() >= MAX_REGION_WIDTH) {
std::cerr << "Warning: Region 2 width (" << m_reg2.Width() << "bp) exceeds " << MAX_REGION_WIDTH << "bp limit. Cluster is too large." << std::endl;
}
assert(m_reg1.Width() < 5000);
assert(m_reg2.Width() < 5000);
}

mapq1 = __getMeanMapq(reads);
mapq2 = __getMeanMapq(mates);
assert(mapq1 >= 0);
Expand Down Expand Up @@ -433,126 +442,6 @@ bool DiscordantCluster::__add_read_to_cluster(svabaReadClusterVector& cvec,
}
}

/*
bool DiscordantCluster::__add_read_to_cluster(svabaReadClusterVector &cvec,
svabaReadVector &clust,
const svabaRead &a,
bool mate) {

// get the position of the previous read. If none, we're starting a new one so make a dummy
std::pair<int,int> last_info;
if (clust.size() == 0)
last_info = {-1, -1};
else if (mate)
last_info = {clust.back().MateChrID(), clust.back().MatePosition()};
else
last_info = {clust.back().ChrID(), clust.back().Position()};

// get the position of the current read
std::pair<int,int> this_info;
if (mate)
this_info = {a.MateChrID(), a.MatePosition()};
else
this_info = {a.ChrID(), a.Position()};

// is this cluster too big? happens if too many discordant reads. Enforce a hard cutoff
bool too_big;
if (mate)
too_big = (clust.size() > 1 && (clust.back().MatePosition() - clust[0].MatePosition()) > 3000);
else
too_big = (clust.size() > 1 && (clust.back().Position() - clust[0].Position()) > 3000);

// note to self. this rarely gets hit?
//if (too_big)
// std::cerr << "Cluster too big at " << clust[0].Brief() << " to " << clust.back().Brief() << ". Breaking off after 3000bp" << std::endl;

// check if this read is close enough to the last
if (!too_big && (this_info.first == last_info.first) && (this_info.second - last_info.second) <= DISC_PAD) {

// read belongs in current cluster, so add
clust.push_back(a);
last_info = this_info;
return true;

// read does not belong to cluster. close this cluster and add to cvec
} else {

// if enough supporting reads, add as a cluster
if (clust.size() >= MIN_PER_CLUST) {
cvec.push_back(clust);
}

// clear this cluster and start a new one
clust.clear();
clust.push_back(a);

return false;
}
}


void DiscordantCluster::__cluster_mate_reads(svabaReadClusterVector& brcv, svabaReadClusterVector& fwd, svabaReadClusterVector& rev)
{
// loop through the clusters, and cluster within clusters based on mate read
for (auto& v : brcv)
{

svabaReadVector this_fwd, this_rev;
std::sort(v.begin(), v.end(), SeqLib::BamRecordSort::ByMatePosition());

for (auto& r : v)
{

// not a discordant read, ignore it
if (r.dd <= 0)
continue;

// forward clustering
if (!r.MateReverseFlag())
__add_read_to_cluster(fwd, this_fwd, r, true);
// reverse clustering
else if (r.MateReverseFlag())
__add_read_to_cluster(rev, this_rev, r, true);

}
// finish the last clusters
if (this_fwd.size() > 0)
fwd.push_back(this_fwd);
if (this_rev.size() > 0)
rev.push_back(this_rev);
} // finish main cluster loop
}
void DiscordantCluster::__cluster_mate_reads(std::vector<std::vector<const svabaRead*>>& brcv,
std::vector<std::vector<const svabaRead*>>& fwd,
std::vector<std::vector<const svabaRead*>>& rev)
{
for (auto& cluster : brcv) {
std::vector<const svabaRead*> this_fwd, this_rev;

std::sort(cluster.begin(), cluster.end(),
[](const svabaRead* a, const svabaRead* b) {
return (a->b.MateChrID() < b->b.MateChrID()) ||
(a->b.MateChrID() == b->b.MateChrID() &&
a->b.MatePosition() < b->b.MatePosition());
});

for (const svabaRead* r : cluster) {
if (r->dd <= 0)
continue;

if (!r->MateReverseFlag())
__add_read_to_cluster(fwd, this_fwd, r, true);
else
__add_read_to_cluster(rev, this_rev, r, true);
}

if (!this_fwd.empty())
fwd.push_back(this_fwd);
if (!this_rev.empty())
rev.push_back(this_rev);
}
}
*/
void DiscordantCluster::__cluster_mate_reads(svabaReadClusterVector& brcv,
svabaReadClusterVector& fwd,
svabaReadClusterVector& rev)
Expand Down Expand Up @@ -584,9 +473,9 @@ void DiscordantCluster::__cluster_mate_reads(svabaReadClusterVector& brcv,
}

// finish the last clusters
if (__valid_cluster(this_fwd, true) & this_fwd.size() >= MIN_PER_CLUST)
if (__valid_cluster(this_fwd, true) && this_fwd.size() >= MIN_PER_CLUST)
fwd.push_back(this_fwd);
if (__valid_cluster(this_rev, true) & this_rev.size() >= MIN_PER_CLUST)
if (__valid_cluster(this_rev, true) && this_rev.size() >= MIN_PER_CLUST)
rev.push_back(this_rev);
}
}
Expand Down Expand Up @@ -661,9 +550,9 @@ bool DiscordantCluster::__valid_cluster(svabaReadPtrVector& clust, bool ismate)
// check if getting too wide
bool too_wide;
if (ismate)
too_wide = (clust.size() > 1 && (clust.back()->Position() - clust[0]->Position()) > 3000);
too_wide = (clust.size() > 1 && (clust.back()->MatePosition() - clust[0]->MatePosition()) > MAX_CLUSTER_WIDTH);
else
too_wide = (clust.size() > 1 && (clust.back()->Position() - clust[0]->Position()) > 3000);
too_wide = (clust.size() > 1 && (clust.back()->Position() - clust[0]->Position()) > MAX_CLUSTER_WIDTH);

return !too_wide;
}
Expand All @@ -688,6 +577,12 @@ void DiscordantCluster::__convertToDiscordantCluster(DiscordantClusterMap &dd,
for (auto& v : cvec) {
DiscordantCluster d(v, bav, header);
dd[d.m_id] = d;

// Warn if widths are too large
if (d.m_reg1.Width() >= MAX_REGION_WIDTH || d.m_reg2.Width() >= MAX_REGION_WIDTH) {
std::cerr << "Warning: Wide cluster " << d.m_id << " with region widths "
<< d.m_reg1.Width() << "bp and " << d.m_reg2.Width() << "bp (>= " << MAX_REGION_WIDTH << "bp limit)" << std::endl;
}
}
}

Expand Down