Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions lib/Bio/EnsEMBL/DataCheck/Checks/DisplayableSampleGene.pm
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,13 @@ sub tests {
my $desc_1 = 'Sample gene has displayable analysis';
my $diag_1 = 'Undisplayed analysis';
my $sql_1 = qq/
SELECT gene_id
SELECT gene_id
FROM gene g
INNER JOIN meta m
ON g.stable_id = m.meta_value
AND m.meta_key = 'sample.gene_param'
INNER JOIN meta m
ON g.stable_id = m.meta_value
AND m.meta_key = 'genebuild.sample_gene'
INNER JOIN analysis a ON g.analysis_id = a.analysis_id
INNER JOIN analysis_description ad
INNER JOIN analysis_description ad
ON g.analysis_id = ad.analysis_id AND ad.displayable = 0
/;

Expand All @@ -58,13 +58,13 @@ sub tests {
my $desc_2 = 'Sample gene has associated web_data';
my $diag_2 = 'web_data is not set';
my $sql_2 = qq/
SELECT gene_id
SELECT gene_id
FROM gene g
INNER JOIN meta m
ON g.stable_id = m.meta_value
AND m.meta_key = 'sample.gene_param'
INNER JOIN meta m
ON g.stable_id = m.meta_value
AND m.meta_key = 'genebuild.sample_gene'
INNER JOIN analysis a ON g.analysis_id = a.analysis_id
INNER JOIN analysis_description ad
INNER JOIN analysis_description ad
ON g.analysis_id = ad.analysis_id AND ad.web_data IS NULL
/;

Expand Down
117 changes: 117 additions & 0 deletions lib/Bio/EnsEMBL/DataCheck/Checks/DisplayableSampleLocation.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
=head1 LICENSE

Copyright [2018-2025] EMBL-European Bioinformatics Institute

Licensed under the Apache License, Version 2.0 (the 'License');
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an 'AS IS' BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=cut

package Bio::EnsEMBL::DataCheck::Checks::DisplayableSampleLocation;

use warnings;
use strict;

use Moose;
use Test::More;
use Bio::EnsEMBL::DataCheck::Test::DataCheck;

extends 'Bio::EnsEMBL::DataCheck::DbCheck';

# Datacheck metadata constants.
#
# DESCRIPTION summarises what tests() verifies for the
# 'genebuild.sample_location' meta key: its presence, its format, and that
# the referenced region exists and overlaps a gene.
#
# NOTE(review): GROUPS still lists 'analysis_description' although none of
# the queries in tests() read the analysis tables - confirm whether that
# group membership is intentional before changing it.
use constant {
  NAME        => 'DisplayableSampleLocation',
  DESCRIPTION => 'Sample location is well-formed, lies within an existing seq_region and overlaps at least one gene',
  GROUPS      => ['analysis_description', 'core', 'geneset', 'meta_sample'],
  DB_TYPES    => ['core'],
  TABLES      => ['gene', 'meta', 'seq_region']
};

# Run the sample-location checks for the current species.
#
# Each check builds a SQL query that selects the offending rows and hands it
# to is_rows_zero() together with a test description and a diagnostic
# message (presumably the test passes when the query yields nothing - see
# Bio::EnsEMBL::DataCheck::Test::DataCheck).
#
# The 'genebuild.sample_location' meta value is expected to look like
# "name:start-end". The queries take it apart with SUBSTRING_INDEX:
#   name  - text before the first ':'
#   start - text between the last ':' and the '-' that follows it
#   end   - text after the last '-'
sub tests {
  my ($self) = @_;

  # All queries are restricted to the meta rows of this species.
  my $species_id = $self->dba->species_id;

  # 1. The meta key must be present exactly once for the species.
  # NOTE(review): confirm how is_rows_zero() evaluates a SELECT COUNT(*)
  # query. If it looks at the returned count rather than at the number of
  # rows, a missing key (count 0) would pass this check.
  my $desc_1 = 'Sample location metadata exists exactly once';
  my $diag_1 = 'genebuild.sample_location meta key should exist exactly once per species';
  my $sql_1  = qq/
    SELECT COUNT(*) AS count
    FROM meta
    WHERE meta_key = 'genebuild.sample_location'
      AND species_id = $species_id
    HAVING count != 1
  /;

  is_rows_zero($self->dba, $sql_1, $desc_1, $diag_1);

  # 2. The value must match "name:digits-digits". The '$' anchor is escaped
  # so that Perl does not treat it as a variable inside qq().
  my $desc_2 = 'Sample location metadata is valid format';
  my $diag_2 = 'genebuild.sample_location format is invalid';
  my $sql_2  = qq/
    SELECT meta_id
    FROM meta
    WHERE meta_key = 'genebuild.sample_location'
      AND species_id = $species_id
      AND meta_value NOT REGEXP '^.+:[0-9]+-[0-9]+\$'
  /;

  is_rows_zero($self->dba, $sql_2, $desc_2, $diag_2);

  # 3. Start must not exceed end. Per the review discussion on this check,
  # the coordinates have to be written in ascending order even when the
  # target feature is on the reverse strand: the browser complained when
  # they were flipped on an example location.
  my $desc_3 = 'Sample location coordinates are properly ordered';
  my $diag_3 = 'genebuild.sample_location start coordinate is greater than end coordinate';
  my $sql_3  = qq/
    SELECT meta_id
    FROM meta
    WHERE meta_key = 'genebuild.sample_location'
      AND species_id = $species_id
      AND CAST(SUBSTRING_INDEX(SUBSTRING_INDEX(meta_value, ':', -1), '-', 1) AS UNSIGNED) >
          CAST(SUBSTRING_INDEX(meta_value, '-', -1) AS UNSIGNED)
  /;

  is_rows_zero($self->dba, $sql_3, $desc_3, $diag_3);

  # 4. The named seq_region must exist and the coordinates must lie within
  # 1..length. The LEFT JOIN only matches a seq_region that satisfies all
  # three conditions, so an unmatched meta row (NULL seq_region_id) is a
  # failure.
  # NOTE(review): seq_region is matched by name only, not by the species'
  # coord_system - confirm this is acceptable for multi-species databases.
  my $desc_4 = 'Sample location references valid seq_region';
  my $diag_4 = 'genebuild.sample_location seq_region does not exist or coordinates out of bounds';
  my $sql_4  = qq/
    SELECT m.meta_id
    FROM meta m
    LEFT JOIN seq_region sr
      ON SUBSTRING_INDEX(m.meta_value, ':', 1) = sr.name
      AND CAST(SUBSTRING_INDEX(SUBSTRING_INDEX(m.meta_value, ':', -1), '-', 1) AS UNSIGNED) >= 1
      AND CAST(SUBSTRING_INDEX(m.meta_value, '-', -1) AS UNSIGNED) <= sr.length
    WHERE m.meta_key = 'genebuild.sample_location'
      AND m.species_id = $species_id
      AND sr.seq_region_id IS NULL
  /;

  is_rows_zero($self->dba, $sql_4, $desc_4, $diag_4);

  # 5. At least one gene must overlap the sample region.
  # Several seq_region rows can share a name (the name is only unique
  # within a coord_system), so the overlapping genes are counted per meta
  # row across every same-named region. Testing "g.gene_id IS NULL" row by
  # row would report a failure as soon as one same-named region without
  # genes exists, even when another one does have overlapping genes.
  # A meta row whose seq_region is missing altogether yields no row here;
  # that case is reported by check 4.
  my $desc_5 = 'Sample location contains at least one gene';
  my $diag_5 = 'genebuild.sample_location region has no genes';
  my $sql_5  = qq/
    SELECT m.meta_id
    FROM meta m
    JOIN seq_region sr
      ON SUBSTRING_INDEX(m.meta_value, ':', 1) = sr.name
    LEFT JOIN gene g
      ON g.seq_region_id = sr.seq_region_id
      AND g.seq_region_start <= CAST(SUBSTRING_INDEX(m.meta_value, '-', -1) AS UNSIGNED)
      AND g.seq_region_end >= CAST(SUBSTRING_INDEX(SUBSTRING_INDEX(m.meta_value, ':', -1), '-', 1) AS UNSIGNED)
    WHERE m.meta_key = 'genebuild.sample_location'
      AND m.species_id = $species_id
    GROUP BY m.meta_id
    HAVING COUNT(g.gene_id) = 0
  /;

  is_rows_zero($self->dba, $sql_5, $desc_5, $diag_5);

}

1;