diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f206702b4..401ee1eb5 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,14 @@ dcicutils Change Log ---------- +8.18.4 +====== +* sarahgonicholson / 2025-08-12 / branch: sn_update_eqm_config / PR-329 + - Update the config file `submitr/config/custom_column_mappings.json` to include properties for DSA as well as DuplexSeq ExternalQualityMetric items + - Fix handling of empty properties in ExternalQualityMetric spreadsheets + - Update the handling of `CUSTOM_COLUMN_MAPPINGS_LOCAL` to use the local version of `custom_column_mappings` in `submitr` (rather than in `dcicutils/submitr`) when set to `True` to help with testing + + 8.18.3 ====== * dmichaels / 2025-03-05 / branch: dmichaels-20250305-add-portal-get-schema-super-types / PR-328 diff --git a/dcicutils/submitr/config/custom_column_mappings.json b/dcicutils/submitr/config/custom_column_mappings.json index cb9fca5a3..e80d408ab 100644 --- a/dcicutils/submitr/config/custom_column_mappings.json +++ b/dcicutils/submitr/config/custom_column_mappings.json @@ -1,11 +1,11 @@ { - "version": "1.0.0", + "version": "2.0.0", "sheet_mappings": { - "ExternalQualityMetric": "external_quality_metric", - "AnotherTypeForCustomColumnMappingsHere": "another_custom_column_mappings_here" + "DuplexSeq_ExternalQualityMetric": "duplexseq_external_quality_metric", + "DSA_ExternalQualityMetric": "dsa_external_quality_metric" }, "column_mappings": { - "external_quality_metric": { + "duplexseq_external_quality_metric": { "total_raw_reads_sequenced": { "qc_values#.derived_from": "{name}", "qc_values#.value": "{value:integer}", @@ -52,7 +52,43 @@ "qc_values#.derived_from": "{name}", "qc_values#.value": "{value:integer}", "qc_values#.key": "Number of Final Post-filtering Consensus Interrogated Base Pairs", - "qc_values#.tooltip": "After applying all filters for variant calling, e.e. Mapping quality, Low complexity regions, a4s2 duplex reconstruction criteria, etc." 
+ "qc_values#.tooltip": "After applying all filters for variant calling, e.g. Mapping quality, Low complexity regions, a4s2 duplex reconstruction criteria, etc." + }, + "genome_coverage_1x": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value:float}", + "qc_values#.key": "Fraction of Genome Coverage at Specific Depths Post-filtering >=1X", + "qc_values#.tooltip": null + }, + "genome_coverage_2x": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value:float}", + "qc_values#.key": "Fraction of Genome Coverage at Specific Depths Post-filtering >=2X", + "qc_values#.tooltip": "Genomic coverage of a4s2 duplexes (%). To estimate the bias introduced by genomic region" + }, + "genome_coverage_3x": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value:float}", + "qc_values#.key": "Fraction of Genome Coverage at Specific Depths Post-filtering >=3X", + "qc_values#.tooltip": null + }, + "genome_coverage_4x": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value:float}", + "qc_values#.key": "Fraction of Genome Coverage at Specific Depths Post-filtering >=4X", + "qc_values#.tooltip": null + }, + "germline_homozygous_snv_count_by_molecule": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value:integer}", + "qc_values#.key": "Germline Homozygous SNV Count by Molecule", + "qc_values#.tooltip": null + }, + "germline_heterozygous_snv_count_by_molecule": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value:integer}", + "qc_values#.key": "Germline Heterozygous SNV Count by Molecule", + "qc_values#.tooltip": null }, "somatic_snv_count_by_molecule": { "qc_values#.derived_from": "{name}", @@ -60,19 +96,175 @@ "qc_values#.key": "Somatic SNV Count by Molecule", "qc_values#.tooltip": null }, + "snv_detection_rate_by_molecule": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value:float}", + "qc_values#.key": "SNV Detection Rate by Molecule", + "qc_values#.tooltip": 
"Detected germline mutations / total germline mutations" + }, "snv_mutation_burden_by_molecule": { "qc_values#.derived_from": "{name}", "qc_values#.value": "{value:float}", "qc_values#.key": "Somatic SNV Mutation Burden by Molecule", "qc_values#.tooltip": "Detected somatic mutation / final consensus interrogated base pairs" + }, + "germline_homozygous_snv_count_by_allele": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value:integer}", + "qc_values#.key": "Germline Homozygous SNV Count by Allele", + "qc_values#.tooltip": null + }, + "germline_heterozygous_snv_count_by_allele": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value:integer}", + "qc_values#.key": "Germline Heterozygous SNV Count by Allele", + "qc_values#.tooltip": null + }, + "somatic_snv_count_by_allele": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value:integer}", + "qc_values#.key": "Somatic SNV Count by Allele", + "qc_values#.tooltip": null + }, + "snv_mutation_burden_by_allele": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value:float}", + "qc_values#.key": "Somatic SNV Mutation Burden by Allele", + "qc_values#.tooltip": "Detected somatic mutation / final consensus interrogated base pairs" + }, + "germline_indel_count_by_molecule": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value:integer}", + "qc_values#.key": "Germline Indel Count by Molecule", + "qc_values#.tooltip": null + }, + "somatic_indel_count_by_molecule": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value:integer}", + "qc_values#.key": "Somatic Indel Count by Molecule", + "qc_values#.tooltip": null + }, + "indel_mutation_burden_by_molecule": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value:float}", + "qc_values#.key": "Somatic Indel Mutation Burden by Molecule", + "qc_values#.tooltip": "Detected somatic indels / detection rate (i.e. 
detected somatic indels/ detected germline indels * total germline indels)" + }, + "germline_indel_count_by_allele": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value:integer}", + "qc_values#.key": "Germline Indel Count by Allele", + "qc_values#.tooltip": null + }, + "somatic_indel_count_by_allele": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value:integer}", + "qc_values#.key": "Somatic Indel Count by Allele", + "qc_values#.tooltip": null + }, + "indel_mutation_burden_by_allele": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value:float}", + "qc_values#.key": "Somatic Indel Mutation Burden by Allele", + "qc_values#.tooltip": null } }, - "another_custom_column_mappings_here": { - "your_custom_column_name": { + "dsa_external_quality_metric": { + "contig_l50": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value: integer}", + "qc_values#.key": "Contig L50", + "qc_values#.tooltip": "L50 of contigs: the count of the smallest number of contigs whose total length makes up 50% of the assembly size" + }, + "contig_n50": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value: integer}", + "qc_values#.key": "Contig N50", + "qc_values#.tooltip": "N50 of contigs: the sequence length of the shortest contig at 50% of the total assembly length (bp)" + }, + "gaps_between_scaffolds": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value: integer}", + "qc_values#.key": "Gaps Between Scaffolds", + "qc_values#.tooltip": "Number of gaps between scaffolds" + }, + "gc_content": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value: float}", + "qc_values#.key": "GC Content", + "qc_values#.tooltip": "GC content of assembly excluding N (%)" + }, + "largest_contig_size": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value: integer}", + "qc_values#.key": "Largest Contig Size", + "qc_values#.tooltip": "Size of the largest contig in the assembly 
(bp)" + }, + "number_of_contigs": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value: integer}", + "qc_values#.key": "Number of Contigs", + "qc_values#.tooltip": "Number of contigs in the assembly" + }, + "number_of_scaffolds": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value: integer}", + "qc_values#.key": "Number of Scaffolds", + "qc_values#.tooltip": "Number of scaffolds in the assembly" + }, + "percent_single_copy": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value: float}", + "qc_values#.key": "Percent Single Copy", + "qc_values#.tooltip": "Percentage of highly conserved genes with a single copy present in the assembly (BUSCO)" + }, + "percent_multi_copy": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value: float}", + "qc_values#.key": "Percent Multiple Copy", + "qc_values#.tooltip": "Percentage of highly conserved genes with multiple copies present in the assembly (BUSCO)" + }, + "percent_fragmented": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value: float}", + "qc_values#.key": "Percent Fragmented", + "qc_values#.tooltip": "Percentage of highly conserved genes with fragmented representation in the assembly (BUSCO)" + }, + "percent_missing": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value: float}", + "qc_values#.key": "Percent Missing", + "qc_values#.tooltip": "Percentage of highly conserved genes that are missing from the assembly (BUSCO)" + }, + "quality_value": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value: float}", + "qc_values#.key": "Quality Value", + "qc_values#.tooltip": "Consensus quality value (QV) of the assembly (from Merqury)" + }, + "scaffold_l50": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value: integer}", + "qc_values#.key": "Scaffold L50", + "qc_values#.tooltip": "L50 of scaffolds: the count of the smallest number of scaffolds whose total length makes up 50% of the 
assembly size" + }, + "scaffold_n50": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value: integer}", + "qc_values#.key": "Scaffold N50", + "qc_values#.tooltip": "N50 of scaffolds: the sequence length of the shortest scaffold at 50% of the total assembly length (bp)" + }, + "total_ungapped_length": { + "qc_values#.derived_from": "{name}", + "qc_values#.value": "{value: integer}", + "qc_values#.key": "Total Ungapped Length", + "qc_values#.tooltip": "Total ungapped length of the assembly (bp)" + }, + "number_of_chromosomes": { "qc_values#.derived_from": "{name}", - "qc_values#.value": "{value}", - "qc_values#.key": "Your key for this column mapping", - "qc_values#.tooltip": "Your tooltip text for this column mapping" + "qc_values#.value": "{value: integer}", + "qc_values#.key": "Number of Chromosomes", + "qc_values#.tooltip": "Number of chromosomes in the assembly" } } } diff --git a/dcicutils/submitr/custom_excel.py b/dcicutils/submitr/custom_excel.py index 9f4ea5291..df40944af 100644 --- a/dcicutils/submitr/custom_excel.py +++ b/dcicutils/submitr/custom_excel.py @@ -115,7 +115,9 @@ def fetch_custom_column_mappings(): if not custom_column_mappings: # Fallback to the actual config file in this package. 
try: - file = os.path.join(os.path.dirname(__file__), "config", "custom_column_mappings.json") + # file = os.path.join(os.path.dirname(__file__), "config", "custom_column_mappings.json") + file = os.path.join("submitr", "config", "custom_column_mappings.json") + # for testing locally in submitr with io.open(file, "r") as f: custom_column_mappings = json.load(f) except Exception: @@ -260,7 +262,7 @@ def _iter_mapper(self, row: dict) -> List[str]: @staticmethod def _parse_value_specifier(value_specifier: Optional[Any], value: Optional[Any]) -> Optional[Any]: - if value is not None: + if value: if isinstance(value_specifier, str) and (value_specifier := value_specifier.replace(" ", "")): if value_specifier.startswith("{value"): if (value_specifier[len(value_specifier) - 1] == "}"): diff --git a/pyproject.toml b/pyproject.toml index 6e2a22878..7e62db89c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicutils" -version = "8.18.3" +version = "8.18.4" description = "Utility package for interacting with the 4DN Data Portal and other 4DN resources" authors = ["4DN-DCIC Team "] license = "MIT"