diff --git a/panpipes/panpipes/pipeline_ingest.py b/panpipes/panpipes/pipeline_ingest.py index 1f6f390f..2be1d913 100644 --- a/panpipes/panpipes/pipeline_ingest.py +++ b/panpipes/panpipes/pipeline_ingest.py @@ -358,6 +358,11 @@ def run_scrublet(infile, outfile, sample_id): --inputpath %(infile)s --outdir %(outdir)s """ + if PARAMS['scr_per_sample']: + if PARAMS['scr_expected_doublet_rate_csv'] is None: + raise ValueError("scr.per_sample is True but scr.expected_doublet_rate_csv is not specified in pipeline.yml") + cmd += " --expected_doublet_rate_csv %(scr_expected_doublet_rate_csv)s" + # always pass the default expected doublet rate, the python script will prioritize the csv if it exists if PARAMS['scr_expected_doublet_rate'] is not None: cmd += " --expected_doublet_rate %(scr_expected_doublet_rate)s" if PARAMS['scr_sim_doublet_ratio'] is not None: diff --git a/panpipes/panpipes/pipeline_ingest/pipeline.yml b/panpipes/panpipes/pipeline_ingest/pipeline.yml index c5ee70b5..b1b88a48 100644 --- a/panpipes/panpipes/pipeline_ingest/pipeline.yml +++ b/panpipes/panpipes/pipeline_ingest/pipeline.yml @@ -65,6 +65,12 @@ plot_10X_metrics: True # Doublet detection on RNA modality scr: run: True + # If per_sample is False or the sample_id is missing from the CSV, + # use this default expected doublet rate. + per_sample: False + expected_doublet_rate_csv: + # If per_sample is False or the sample_id is missing from the CSV, + # use this default expected doublet rate. expected_doublet_rate: 0.06 sim_doublet_ratio: 2 n_neighbours: 20 diff --git a/panpipes/python_scripts/run_scrublet_scores.py b/panpipes/python_scripts/run_scrublet_scores.py index fa5a9ebb..64e5d6b8 100644 --- a/panpipes/python_scripts/run_scrublet_scores.py +++ b/panpipes/python_scripts/run_scrublet_scores.py @@ -35,6 +35,9 @@ parser.add_argument("--expected_doublet_rate", default=0.06, help="the expected fraction of transcriptomes that are doublets, typically 0.05-0.1. Results are not particularly sensitive to this parameter") +parser.add_argument("--expected_doublet_rate_csv", + default=None, + help="Path to a CSV file with 'sample_id' and 'expected_doublet_rate' columns for sample-specific rates.") parser.add_argument("--sim_doublet_ratio", default=2, help="the number of doublets to simulate, relative to the number of observed transcriptomes. Setting too high is computationally expensive. Min tested 0.5") @@ -68,15 +71,37 @@ L.info("Reading in data from '%s'" % input) adata = mu.read(args.inputpath + "/rna") +# Determine the expected doublet rate. Start with the default. +expected_doublet_rate = float(args.expected_doublet_rate) +# If a CSV is provided, try to find a sample-specific rate. +if args.expected_doublet_rate_csv is not None and os.path.exists(args.expected_doublet_rate_csv): + L.info("Reading sample-specific doublet rates from %s", args.expected_doublet_rate_csv) + try: + rates_df = pd.read_csv(args.expected_doublet_rate_csv) + if not all(col in rates_df.columns for col in ['sample_id', 'expected_doublet_rate']): + raise ValueError("CSV must contain 'sample_id' and 'expected_doublet_rate' columns.") + + sample_rate_series = rates_df.loc[rates_df['sample_id'] == args.sample_id, 'expected_doublet_rate'] + + if sample_rate_series.empty: + L.warning("sample_id '%s' not found in %s. Falling back to default rate %f", args.sample_id, args.expected_doublet_rate_csv, expected_doublet_rate) + else: + expected_doublet_rate = float(sample_rate_series.iloc[0]) + L.info("Using sample-specific expected_doublet_rate %f for sample %s", expected_doublet_rate, args.sample_id) + + except Exception as e: + L.error("Failed to read or parse expected doublet rate from CSV: %s. Falling back to default rate %f.", e, expected_doublet_rate) +else: + L.info("Using default expected_doublet_rate: %f", expected_doublet_rate) counts_matrix = adata.X L.info('Counts matrix shape: {} rows, {} columns'.format(counts_matrix.shape[0], counts_matrix.shape[1])) L.info('Number of genes in gene list: {}'.format(len(adata.var_names))) -L.info("Initializing the scrublet object with expected_doublet_rate: %s" % args.expected_doublet_rate) +L.info("Initializing the scrublet object with expected_doublet_rate: %s" % expected_doublet_rate) -scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=float(args.expected_doublet_rate)) +scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=expected_doublet_rate) L.info("Predicting doublets with params: \nmincells: %s \nmingenes: %s \nmin_gene_variabilty_pctl: %s\nn_prin_comps: %s\n" % (args.min_counts, args.min_cells, args.min_gene_variability_pctl, args.n_prin_comps)) L.info("Predicting using:\n") diff --git a/panpipes/resources/sample_expected_doublet_rates.csv b/panpipes/resources/sample_expected_doublet_rates.csv new file mode 100644 index 00000000..2f34fd84 --- /dev/null +++ b/panpipes/resources/sample_expected_doublet_rates.csv @@ -0,0 +1,3 @@ +sample_id,expected_doublet_rate +Sample1,0.10 +Sample2,0.7 \ No newline at end of file