11import dataclasses
22import logging
3+ import os
34import pathlib
5+ import warnings
46
57import numpy as np
68
@@ -16,87 +18,209 @@ class PlinkPaths:
1618 fam_path : str
1719
1820
21+ @dataclasses .dataclass
22+ class FamData :
23+ sid : np .ndarray
24+ sid_count : int
25+
26+
27+ @dataclasses .dataclass
28+ class BimData :
29+ chromosome : np .ndarray
30+ vid : np .ndarray
31+ bp_position : np .ndarray
32+ allele_1 : np .ndarray
33+ allele_2 : np .ndarray
34+ vid_count : int
35+
36+
1937class PlinkFormat (vcz .Source ):
20- @core .requires_optional_dependency ("bed_reader" , "plink" )
2138 def __init__ (self , prefix ):
22- import bed_reader
23-
2439 # TODO we will need support multiple chromosomes here to join
2540 # plinks into on big zarr. So, these will require multiple
2641 # bed and bim files, but should share a .fam
2742 self .prefix = str (prefix )
28- paths = PlinkPaths (
43+ self . paths = PlinkPaths (
2944 self .prefix + ".bed" ,
3045 self .prefix + ".bim" ,
3146 self .prefix + ".fam" ,
3247 )
33- self .bed = bed_reader .open_bed (
34- paths .bed_path ,
35- bim_location = paths .bim_path ,
36- fam_location = paths .fam_path ,
37- num_threads = 1 ,
38- count_A1 = True ,
48+
49+ # Read sample information from .fam file
50+ samples = []
51+ with open (self .paths .fam_path ) as f :
52+ for line in f :
53+ fields = line .strip ().split ()
54+ if len (fields ) >= 2 : # At minimum, we need FID and IID
55+ samples .append (fields [1 ])
56+ self .fam = FamData (sid = np .array (samples ), sid_count = len (samples ))
57+ self .n_samples = len (samples )
58+
59+ # Read variant information from .bim file
60+ chromosomes = []
61+ vids = []
62+ positions = []
63+ allele1 = []
64+ allele2 = []
65+
66+ with open (self .paths .bim_path ) as f :
67+ for line in f :
68+ fields = line .strip ().split ()
69+ if len (fields ) >= 6 :
70+ chrom , vid , _ , pos , a1 , a2 = (
71+ fields [0 ],
72+ fields [1 ],
73+ fields [2 ],
74+ fields [3 ],
75+ fields [4 ],
76+ fields [5 ],
77+ )
78+ chromosomes .append (chrom )
79+ vids .append (vid )
80+ positions .append (int (pos ))
81+ allele1 .append (a1 )
82+ allele2 .append (a2 )
83+
84+ self .bim = BimData (
85+ chromosome = np .array (chromosomes ),
86+ vid = np .array (vids ),
87+ bp_position = np .array (positions ),
88+ allele_1 = np .array (allele1 ),
89+ allele_2 = np .array (allele2 ),
90+ vid_count = len (vids ),
3991 )
92+ self .n_variants = len (vids )
93+
94+ # Calculate bytes per SNP: 1 byte per 4 samples, rounded up
95+ self .bytes_per_snp = (self .n_samples + 3 ) // 4
96+
97+ # Verify BED file has correct magic bytes
98+ with open (self .paths .bed_path , "rb" ) as f :
99+ magic = f .read (3 )
100+ assert magic == b"\x6c \x1b \x01 " , "Invalid BED file format"
101+
102+ expected_size = self .n_variants * self .bytes_per_snp + 3 # +3 for magic bytes
103+ actual_size = os .path .getsize (self .paths .bed_path )
104+ if actual_size < expected_size :
105+ raise ValueError (
106+ f"BED file is truncated: expected at least { expected_size } bytes, "
107+ f"but only found { actual_size } bytes. "
108+ f"Check that .bed, .bim, and .fam files match."
109+ )
110+ elif actual_size > expected_size :
111+ # Warn if there's extra data (might indicate file mismatch)
112+ warnings .warn (
113+ f"BED file contains { actual_size } bytes but only expected "
114+ f"{ expected_size } . "
115+ f"Using first { expected_size } bytes only." ,
116+ stacklevel = 1 ,
117+ )
118+
119+ # Initialize the lookup table with shape (256, 4, 2)
120+ # 256 possible byte values, 4 samples per byte, 2 alleles per sample
121+ lookup = np .zeros ((256 , 4 , 2 ), dtype = np .int8 )
122+
123+ # For each possible byte value (0-255)
124+ for byte in range (256 ):
125+ # For each of the 4 samples encoded in this byte
126+ for sample in range (4 ):
127+ # Extract the 2 bits for this sample
128+ bits = (byte >> (sample * 2 )) & 0b11
129+ # Convert PLINK's bit encoding to genotype values
130+ if bits == 0b00 :
131+ lookup [byte , sample ] = [1 , 1 ]
132+ elif bits == 0b01 :
133+ lookup [byte , sample ] = [- 1 , - 1 ]
134+ elif bits == 0b10 :
135+ lookup [byte , sample ] = [0 , 1 ]
136+ elif bits == 0b11 :
137+ lookup [byte , sample ] = [0 , 0 ]
138+
139+ self .byte_lookup = lookup
40140
41141 @property
42142 def path (self ):
43143 return self .prefix
44144
45145 @property
46146 def num_records (self ):
47- return self .bed . sid_count
147+ return self .bim . vid_count
48148
49149 @property
50150 def samples (self ):
51- return [vcz .Sample (id = sample ) for sample in self .bed . iid ]
151+ return [vcz .Sample (id = sample ) for sample in self .fam . sid ]
52152
53153 @property
54154 def contigs (self ):
55- return [vcz .Contig (id = str (chrom )) for chrom in np .unique (self .bed .chromosome )]
155+ return [vcz .Contig (id = str (chrom )) for chrom in np .unique (self .bim .chromosome )]
56156
57157 @property
58158 def num_samples (self ):
59159 return len (self .samples )
60160
61161 def iter_contig (self , start , stop ):
62162 chrom_to_contig_index = {contig .id : i for i , contig in enumerate (self .contigs )}
63- for chrom in self .bed .chromosome [start :stop ]:
163+ for chrom in self .bim .chromosome [start :stop ]:
64164 yield chrom_to_contig_index [str (chrom )]
65165
66166 def iter_field (self , field_name , shape , start , stop ):
67167 assert field_name == "position" # Only position field is supported from plink
68- yield from self .bed .bp_position [start :stop ]
168+ yield from self .bim .bp_position [start :stop ]
69169
70170 def iter_id (self , start , stop ):
71- yield from self .bed . sid [start :stop ]
171+ yield from self .bim . vid [start :stop ]
72172
73173 def iter_alleles_and_genotypes (self , start , stop , shape , num_alleles ):
74- alt_field = self .bed .allele_1
75- ref_field = self .bed .allele_2
76- bed_chunk = self .bed .read (slice (start , stop ), dtype = np .int8 ).T
77- gt = np .zeros (shape , dtype = np .int8 )
78- phased = np .zeros (shape [:- 1 ], dtype = bool )
174+ alt_field = self .bim .allele_1
175+ ref_field = self .bim .allele_2
176+
177+ chunk_size = stop - start
178+
179+ # Calculate file offsets for the required data
180+ # 3 bytes for the magic number at the beginning of the file
181+ start_offset = 3 + (start * self .bytes_per_snp )
182+ bytes_to_read = chunk_size * self .bytes_per_snp
183+
184+ # Read only the needed portion of the BED file
185+ with open (self .paths .bed_path , "rb" ) as f :
186+ f .seek (start_offset )
187+ chunk_data = f .read (bytes_to_read )
188+
189+ data_bytes = np .frombuffer (chunk_data , dtype = np .uint8 )
190+ data_matrix = data_bytes .reshape (chunk_size , self .bytes_per_snp )
191+
192+ # Apply lookup table to get genotypes
193+ # Shape becomes: (chunk_size, bytes_per_snp, 4, 2)
194+ all_genotypes = self .byte_lookup [data_matrix ]
195+
196+ # Reshape to get all samples in one dimension
197+ # (chunk_size, bytes_per_snp*4, 2)
198+ samples_padded = self .bytes_per_snp * 4
199+ genotypes_reshaped = all_genotypes .reshape (chunk_size , samples_padded , 2 )
200+
201+ gt = genotypes_reshaped [:, : self .n_samples ]
202+
203+ phased = np .zeros ((chunk_size , self .n_samples ), dtype = bool )
204+
79205 for i , (ref , alt ) in enumerate (
80206 zip (ref_field [start :stop ], alt_field [start :stop ])
81207 ):
82208 alleles = np .full (num_alleles , constants .STR_FILL , dtype = "O" )
83209 alleles [0 ] = ref
84210 alleles [1 : 1 + len (alt )] = alt
85- gt [:] = 0
86- gt [bed_chunk [i ] == - 127 ] = - 1
87- gt [bed_chunk [i ] == 2 ] = 1
88- gt [bed_chunk [i ] == 1 , 1 ] = 1
89211
90212 # rlen is the length of the REF in PLINK as there's no END annotations
91- yield vcz .VariantData (len (alleles [0 ]), alleles , gt , phased )
213+ yield vcz .VariantData (
214+ len (alleles [0 ]), alleles , gt [i ].copy (), phased [i ].copy ()
215+ )
92216
93217 def generate_schema (
94218 self ,
95219 variants_chunk_size = None ,
96220 samples_chunk_size = None ,
97221 ):
98- n = self .bed . iid_count
99- m = self .bed . sid_count
222+ n = self .fam . sid_count
223+ m = self .bim . vid_count
100224 logging .info (f"Scanned plink with { n } samples and { m } variants" )
101225 dimensions = vcz .standard_dimensions (
102226 variants_size = m ,
@@ -119,7 +243,7 @@ def generate_schema(
119243 )
120244 # If we don't have SVLEN or END annotations, the rlen field is defined
121245 # as the length of the REF
122- max_len = self .bed .allele_2 .itemsize
246+ max_len = self .bim .allele_2 .itemsize
123247
124248 array_specs = [
125249 vcz .ZarrArraySpec (
@@ -156,7 +280,7 @@ def generate_schema(
156280 ),
157281 vcz .ZarrArraySpec (
158282 name = "variant_contig" ,
159- dtype = core .min_int_dtype (0 , len (np .unique (self .bed .chromosome ))),
283+ dtype = core .min_int_dtype (0 , len (np .unique (self .bim .chromosome ))),
160284 dimensions = ["variants" ],
161285 description = "Contig/chromosome index for each variant" ,
162286 ),
0 commit comments