Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions panpipes/panpipes/pipeline_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,11 @@ def run_scrublet(infile, outfile, sample_id):
--inputpath %(infile)s
--outdir %(outdir)s
"""
if PARAMS['scr_per_sample']:
if PARAMS['scr_expected_doublet_rate_csv'] is None:
raise ValueError("scr.per_sample is True but scr.expected_doublet_rate_csv is not specified in pipeline.yml")
cmd += " --expected_doublet_rate_csv %(scr_expected_doublet_rate_csv)s"
# always pass the default expected doublet rate, the python script will prioritize the csv if it exists
if PARAMS['scr_expected_doublet_rate'] is not None:
cmd += " --expected_doublet_rate %(scr_expected_doublet_rate)s"
if PARAMS['scr_sim_doublet_ratio'] is not None:
Expand Down
6 changes: 6 additions & 0 deletions panpipes/panpipes/pipeline_ingest/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,12 @@ plot_10X_metrics: True
# Doublet detection on RNA modality
scr:
run: True
# If per_sample is False or the sample_id is missing from the CSV,
# use this default expected doublet rate.
per_sample: False
expected_doublet_rate_csv:
# If per_sample is False or the sample_id is missing from the CSV,
# use this default expected doublet rate.
expected_doublet_rate: 0.06
sim_doublet_ratio: 2
n_neighbours: 20
Expand Down
29 changes: 27 additions & 2 deletions panpipes/python_scripts/run_scrublet_scores.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@
parser.add_argument("--expected_doublet_rate",
default=0.06,
help="the expected fraction of transcriptomes that are doublets, typically 0.05-0.1. Results are not particularly sensitive to this parameter")
parser.add_argument("--expected_doublet_rate_csv",
default=None,
help="Path to a CSV file with 'sample_id' and 'expected_doublet_rate' columns for sample-specific rates.")
parser.add_argument("--sim_doublet_ratio",
default=2,
help="the number of doublets to simulate, relative to the number of observed transcriptomes. Setting too high is computationally expensive. Min tested 0.5")
Expand Down Expand Up @@ -68,15 +71,37 @@
L.info("Reading in data from '%s'" % input)
adata = mu.read(args.inputpath + "/rna")

# Determine the expected doublet rate. Start with the default.
expected_doublet_rate = float(args.expected_doublet_rate)

# If a CSV is provided, try to find a sample-specific rate.
if args.expected_doublet_rate_csv is not None and os.path.exists(args.expected_doublet_rate_csv):
L.info("Reading sample-specific doublet rates from %s", args.expected_doublet_rate_csv)
try:
rates_df = pd.read_csv(args.expected_doublet_rate_csv)
if not all(col in rates_df.columns for col in ['sample_id', 'expected_doublet_rate']):
raise ValueError("CSV must contain 'sample_id' and 'expected_doublet_rate' columns.")

sample_rate_series = rates_df.loc[rates_df['sample_id'] == args.sample_id, 'expected_doublet_rate']

if sample_rate_series.empty:
L.warning("sample_id '%s' not found in %s. Falling back to default rate %f", args.sample_id, args.expected_doublet_rate_csv, expected_doublet_rate)
else:
expected_doublet_rate = float(sample_rate_series.iloc[0])
L.info("Using sample-specific expected_doublet_rate %f for sample %s", expected_doublet_rate, args.sample_id)

except Exception as e:
L.error("Failed to read or parse expected doublet rate from CSV: %s. Falling back to default rate %f.", e, expected_doublet_rate)
else:
L.info("Using default expected_doublet_rate: %f", expected_doublet_rate)

counts_matrix = adata.X
L.info('Counts matrix shape: {} rows, {} columns'.format(counts_matrix.shape[0], counts_matrix.shape[1]))
L.info('Number of genes in gene list: {}'.format(len(adata.var_names)))

L.info("Initializing the scrublet object with expected_doublet_rate: %s" % args.expected_doublet_rate)
L.info("Initializing the scrublet object with expected_doublet_rate: %s" % expected_doublet_rate)

scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=float(args.expected_doublet_rate))
scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=expected_doublet_rate)
L.info("Predicting doublets with params: \nmincells: %s \nmingenes: %s \nmin_gene_variabilty_pctl: %s\nn_prin_comps: %s\n" % (args.min_counts, args.min_cells, args.min_gene_variability_pctl, args.n_prin_comps))
L.info("Predicting using:\n")

Expand Down
3 changes: 3 additions & 0 deletions panpipes/resources/sample_expected_doublet_rates.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sample_id,expected_doublet_rate
Sample1,0.10
Sample2,0.7
Loading