DendrouLab · deevdevil88 · Dec 11, 2025 · Oct 1, 2025 · Nov 7, 2025 · Nov 7, 2025
diff --git a/panpipes/panpipes/pipeline_ingest.py b/panpipes/panpipes/pipeline_ingest.py
@@ -358,6 +358,11 @@ def run_scrublet(infile, outfile, sample_id):
         --inputpath %(infile)s
         --outdir %(outdir)s
         """
+    if PARAMS['scr_per_sample']:
+        if PARAMS['scr_expected_doublet_rate_csv'] is None:
+            raise ValueError("scr.per_sample is True but scr.expected_doublet_rate_csv is not specified in pipeline.yml")
+        cmd += " --expected_doublet_rate_csv %(scr_expected_doublet_rate_csv)s"
+    # always pass the default expected doublet rate, the python script will prioritize the csv if it exists
     if PARAMS['scr_expected_doublet_rate'] is not None:
         cmd += " --expected_doublet_rate %(scr_expected_doublet_rate)s"
     if PARAMS['scr_sim_doublet_ratio'] is not None:

diff --git a/panpipes/panpipes/pipeline_ingest/pipeline.yml b/panpipes/panpipes/pipeline_ingest/pipeline.yml
@@ -65,6 +65,12 @@ plot_10X_metrics: True
 # Doublet detection on RNA modality
 scr:
   run: True
+  # If per_sample is False or the sample_id is missing from the CSV,
+  # use this default expected doublet rate.
+  per_sample: False
+  expected_doublet_rate_csv:
+  # If per_sample is False or the sample_id is missing from the CSV,
+  # use this default expected doublet rate.
   expected_doublet_rate: 0.06
   sim_doublet_ratio: 2
   n_neighbours: 20

diff --git a/panpipes/python_scripts/run_scrublet_scores.py b/panpipes/python_scripts/run_scrublet_scores.py
@@ -35,6 +35,9 @@
 parser.add_argument("--expected_doublet_rate",
                     default=0.06,
                     help="the expected fraction of transcriptomes that are doublets, typically 0.05-0.1. Results are not particularly sensitive to this parameter")
+parser.add_argument("--expected_doublet_rate_csv",
+                    default=None,
+                    help="Path to a CSV file with 'sample_id' and 'expected_doublet_rate' columns for sample-specific rates.")
 parser.add_argument("--sim_doublet_ratio",
                     default=2,
                     help="the number of doublets to simulate, relative to the number of observed transcriptomes. Setting too high is computationally expensive. Min tested 0.5")
@@ -68,15 +71,37 @@
 L.info("Reading in data from '%s'" % input)
 adata = mu.read(args.inputpath + "/rna")
 
+# Determine the expected doublet rate. Start with the default.
+expected_doublet_rate = float(args.expected_doublet_rate)
 
+# If a CSV is provided, try to find a sample-specific rate.
+if args.expected_doublet_rate_csv is not None and os.path.exists(args.expected_doublet_rate_csv):
+    L.info("Reading sample-specific doublet rates from %s", args.expected_doublet_rate_csv)
+    try:
+        rates_df = pd.read_csv(args.expected_doublet_rate_csv)
+        if not all(col in rates_df.columns for col in ['sample_id', 'expected_doublet_rate']):
+            raise ValueError("CSV must contain 'sample_id' and 'expected_doublet_rate' columns.")
+
+        sample_rate_series = rates_df.loc[rates_df['sample_id'] == args.sample_id, 'expected_doublet_rate']
+
+        if sample_rate_series.empty:
+            L.warning("sample_id '%s' not found in %s. Falling back to default rate %f", args.sample_id, args.expected_doublet_rate_csv, expected_doublet_rate)
+        else:
+            expected_doublet_rate = float(sample_rate_series.iloc[0])
+            L.info("Using sample-specific expected_doublet_rate %f for sample %s", expected_doublet_rate, args.sample_id)
+
+    except Exception as e:
+        L.error("Failed to read or parse expected doublet rate from CSV: %s. Falling back to default rate %f.", e, expected_doublet_rate)
+else:
+    L.info("Using default expected_doublet_rate: %f", expected_doublet_rate)
 
 counts_matrix = adata.X
 L.info('Counts matrix shape: {} rows, {} columns'.format(counts_matrix.shape[0], counts_matrix.shape[1]))
 L.info('Number of genes in gene list: {}'.format(len(adata.var_names)))
 
-L.info("Initializing the scrublet object with expected_doublet_rate: %s" % args.expected_doublet_rate)
+L.info("Initializing the scrublet object with expected_doublet_rate: %s" % expected_doublet_rate)
 
-scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=float(args.expected_doublet_rate))
+scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=expected_doublet_rate)
 L.info("Predicting doublets with params: \nmincells: %s \nmingenes: %s \nmin_gene_variabilty_pctl: %s\nn_prin_comps: %s\n" % (args.min_counts, args.min_cells, args.min_gene_variability_pctl, args.n_prin_comps))
 L.info("Predicting using:\n")
 

diff --git a/panpipes/resources/sample_expected_doublet_rates.csv b/panpipes/resources/sample_expected_doublet_rates.csv
@@ -0,0 +1,3 @@
+sample_id,expected_doublet_rate
+Sample1,0.10
+Sample2,0.7