Remove bed_reader

benjeffery · benjeffery · commit a56810375681 · 2025-05-22T14:53:45.000+01:00
diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
@@ -1,6 +1,8 @@
 import dataclasses
 import logging
+import os
 import pathlib
+import warnings
 
 import numpy as np
 
@@ -16,87 +18,209 @@ class PlinkPaths:
     fam_path: str
 
 
+@dataclasses.dataclass
+class FamData:
+    sid: np.ndarray  # sample IDs: second column (IID) of each .fam row
+    sid_count: int  # number of sample IDs held in ``sid``
+
+
+@dataclasses.dataclass
+class BimData:
+    chromosome: np.ndarray  # .bim column 1, kept as strings
+    vid: np.ndarray  # .bim column 2 (variant ID)
+    bp_position: np.ndarray  # .bim column 4, parsed with int()
+    allele_1: np.ndarray  # .bim column 5; emitted as the ALT allele
+    allele_2: np.ndarray  # .bim column 6; emitted as the REF allele
+    vid_count: int  # number of variant rows read from the .bim file
+
+
 class PlinkFormat(vcz.Source):
-    @core.requires_optional_dependency("bed_reader", "plink")
     def __init__(self, prefix):
-        import bed_reader
-
+        """Load .fam/.bim metadata for ``prefix`` and validate the .bed file.
+
+        Raises ValueError when the .bed magic bytes are wrong or the file is
+        too short for the samples and variants read; warns when it is longer.
+        Rows with too few columns in .fam/.bim are skipped.
+        """
         # TODO we will need support multiple chromosomes here to join
         # plinks into on big zarr. So, these will require multiple
         # bed and bim files, but should share a .fam
         self.prefix = str(prefix)
-        paths = PlinkPaths(
+        self.paths = PlinkPaths(
             self.prefix + ".bed",
             self.prefix + ".bim",
             self.prefix + ".fam",
         )
-        self.bed = bed_reader.open_bed(
-            paths.bed_path,
-            bim_location=paths.bim_path,
-            fam_location=paths.fam_path,
-            num_threads=1,
-            count_A1=True,
+
+        # Read sample information from .fam file
+        samples = []
+        with open(self.paths.fam_path) as f:
+            for line in f:
+                fields = line.strip().split()
+                if len(fields) >= 2:  # At minimum, we need FID and IID
+                    samples.append(fields[1])
+        self.fam = FamData(sid=np.array(samples), sid_count=len(samples))
+        self.n_samples = len(samples)
+
+        # Read variant information from .bim file
+        chromosomes = []
+        vids = []
+        positions = []
+        allele1 = []
+        allele2 = []
+
+        with open(self.paths.bim_path) as f:
+            for line in f:
+                fields = line.strip().split()
+                if len(fields) >= 6:
+                    chrom, vid, _, pos, a1, a2 = fields[:6]
+                    chromosomes.append(chrom)
+                    vids.append(vid)
+                    positions.append(int(pos))
+                    allele1.append(a1)
+                    allele2.append(a2)
+
+        self.bim = BimData(
+            chromosome=np.array(chromosomes),
+            vid=np.array(vids),
+            bp_position=np.array(positions),
+            allele_1=np.array(allele1),
+            allele_2=np.array(allele2),
+            vid_count=len(vids),
         )
+        self.n_variants = len(vids)
+
+        # Calculate bytes per SNP: 1 byte per 4 samples, rounded up
+        self.bytes_per_snp = (self.n_samples + 3) // 4
+
+        # Validate the BED magic bytes with a real exception: asserts vanish under -O
+        with open(self.paths.bed_path, "rb") as f:
+            magic = f.read(3)
+        if magic != b"\x6c\x1b\x01":
+            raise ValueError("Invalid BED file format")
+
+        expected_size = self.n_variants * self.bytes_per_snp + 3  # +3 for magic bytes
+        actual_size = os.path.getsize(self.paths.bed_path)
+        if actual_size < expected_size:
+            raise ValueError(
+                f"BED file is truncated: expected at least {expected_size} bytes, "
+                f"but only found {actual_size} bytes. "
+                f"Check that .bed, .bim, and .fam files match."
+            )
+        elif actual_size > expected_size:
+            # Warn if there's extra data (might indicate file mismatch)
+            warnings.warn(
+                f"BED file contains {actual_size} bytes but only expected "
+                f"{expected_size}. "
+                f"Using first {expected_size} bytes only.",
+                stacklevel=2,  # attribute the warning to the caller
+            )
+
+        # Initialize the lookup table with shape (256, 4, 2)
+        # 256 possible byte values, 4 samples per byte, 2 alleles per sample
+        lookup = np.zeros((256, 4, 2), dtype=np.int8)
+
+        # For each possible byte value (0-255)
+        for byte in range(256):
+            # For each of the 4 samples encoded in this byte
+            for sample in range(4):
+                # Extract the 2 bits for this sample
+                bits = (byte >> (sample * 2)) & 0b11
+                # PLINK 2-bit code -> allele pair (1 = A1, 0 = A2, -1 = missing)
+                if bits == 0b00:
+                    lookup[byte, sample] = [1, 1]
+                elif bits == 0b01:
+                    lookup[byte, sample] = [-1, -1]
+                elif bits == 0b10:
+                    lookup[byte, sample] = [0, 1]
+                elif bits == 0b11:
+                    lookup[byte, sample] = [0, 0]
+
+        self.byte_lookup = lookup
 
     @property
     def path(self):
         return self.prefix
 
     @property
     def num_records(self):
-        return self.bed.sid_count
+        return self.bim.vid_count  # one record per variant row read from .bim
 
     @property
     def samples(self):
-        return [vcz.Sample(id=sample) for sample in self.bed.iid]
+        return [vcz.Sample(id=sample) for sample in self.fam.sid]  # .fam IIDs
 
     @property
     def contigs(self):
-        return [vcz.Contig(id=str(chrom)) for chrom in np.unique(self.bed.chromosome)]
+        return [vcz.Contig(id=str(chrom)) for chrom in np.unique(self.bim.chromosome)]
 
     @property
     def num_samples(self):
         return len(self.samples)
 
     def iter_contig(self, start, stop):
         chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)}
-        for chrom in self.bed.chromosome[start:stop]:
+        for chrom in self.bim.chromosome[start:stop]:  # .bim chromosome labels
             yield chrom_to_contig_index[str(chrom)]
 
     def iter_field(self, field_name, shape, start, stop):
         assert field_name == "position"  # Only position field is supported from plink
-        yield from self.bed.bp_position[start:stop]
+        yield from self.bim.bp_position[start:stop]  # integer positions from .bim
 
     def iter_id(self, start, stop):
-        yield from self.bed.sid[start:stop]
+        yield from self.bim.vid[start:stop]  # variant IDs from .bim column 2
 
     def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
-        alt_field = self.bed.allele_1
-        ref_field = self.bed.allele_2
-        bed_chunk = self.bed.read(slice(start, stop), dtype=np.int8).T
-        gt = np.zeros(shape, dtype=np.int8)
-        phased = np.zeros(shape[:-1], dtype=bool)
+        """Yield a ``vcz.VariantData`` for each variant in ``[start, stop)``.
+
+        Genotypes are decoded from the .bed file through ``self.byte_lookup``;
+        every call is reported as unphased and ``shape`` is not consulted.
+        """
+        # .bim allele_1 is used as ALT and allele_2 as REF
+        alt_field = self.bim.allele_1
+        ref_field = self.bim.allele_2
+        chunk_size = stop - start
+
+        # Calculate file offsets for the required data
+        # 3 bytes for the magic number at the beginning of the file
+        start_offset = 3 + (start * self.bytes_per_snp)
+        bytes_to_read = chunk_size * self.bytes_per_snp
+        # Read only the needed portion of the BED file
+        with open(self.paths.bed_path, "rb") as f:
+            f.seek(start_offset)
+            chunk_data = f.read(bytes_to_read)
+        data_bytes = np.frombuffer(chunk_data, dtype=np.uint8)
+        data_matrix = data_bytes.reshape(chunk_size, self.bytes_per_snp)
+
+        # Apply lookup table to get genotypes
+        # Shape becomes: (chunk_size, bytes_per_snp, 4, 2)
+        all_genotypes = self.byte_lookup[data_matrix]
+        # Reshape to get all samples in one dimension
+        # (chunk_size, bytes_per_snp*4, 2)
+        samples_padded = self.bytes_per_snp * 4
+        genotypes_reshaped = all_genotypes.reshape(chunk_size, samples_padded, 2)
+        # Discard the padding slots that round each variant up to a whole byte
+        gt = genotypes_reshaped[:, : self.n_samples]
+        phased = np.zeros((chunk_size, self.n_samples), dtype=bool)
         for i, (ref, alt) in enumerate(
             zip(ref_field[start:stop], alt_field[start:stop])
         ):
             alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
             alleles[0] = ref
             alleles[1 : 1 + len(alt)] = alt
-            gt[:] = 0
-            gt[bed_chunk[i] == -127] = -1
-            gt[bed_chunk[i] == 2] = 1
-            gt[bed_chunk[i] == 1, 1] = 1
 
             # rlen is the length of the REF in PLINK as there's no END annotations
-            yield vcz.VariantData(len(alleles[0]), alleles, gt, phased)
+            yield vcz.VariantData(
+                len(alleles[0]), alleles, gt[i].copy(), phased[i].copy()
+            )
 
     def generate_schema(
         self,
         variants_chunk_size=None,
         samples_chunk_size=None,
     ):
-        n = self.bed.iid_count
-        m = self.bed.sid_count
+        n = self.fam.sid_count
+        m = self.bim.vid_count
         logging.info(f"Scanned plink with {n} samples and {m} variants")
         dimensions = vcz.standard_dimensions(
             variants_size=m,
@@ -119,7 +243,7 @@ def generate_schema(
         )
         # If we don't have SVLEN or END annotations, the rlen field is defined
         # as the length of the REF
-        max_len = self.bed.allele_2.itemsize
+        max_len = self.bim.allele_2.itemsize
 
         array_specs = [
             vcz.ZarrArraySpec(
@@ -156,7 +280,7 @@ def generate_schema(
             ),
             vcz.ZarrArraySpec(
                 name="variant_contig",
-                dtype=core.min_int_dtype(0, len(np.unique(self.bed.chromosome))),
+                dtype=core.min_int_dtype(0, len(np.unique(self.bim.chromosome))),
                 dimensions=["variants"],
                 description="Contig/chromosome index for each variant",
             ),
diff --git a/tests/test_plink.py b/tests/test_plink.py
@@ -1,5 +1,3 @@
-from unittest import mock
-
 import bed_reader
 import numpy as np
 import numpy.testing as nt
@@ -11,22 +9,6 @@
 from bio2zarr import plink, vcf
 
 
-def test_missing_dependency():
-    with mock.patch(
-        "importlib.import_module",
-        side_effect=ImportError("No module named 'bed_reader'"),
-    ):
-        with pytest.raises(ImportError) as exc_info:
-            plink.convert(
-                "UNUSED_PATH",
-                "UNUSED_PATH",
-            )
-        assert (
-            "This process requires the optional bed_reader module. "
-            "Install it with: pip install bio2zarr[plink]" in str(exc_info.value)
-        )
-
-
 class TestSmallExample:
     @pytest.fixture(scope="class")
     def bed_path(self, tmp_path_factory):