-
Notifications
You must be signed in to change notification settings - Fork 0
/
pgenlib_internal.h
1483 lines (1273 loc) · 72.5 KB
/
pgenlib_internal.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#ifndef __PGENLIB_INTERNAL_H__
#define __PGENLIB_INTERNAL_H__
// This library is part of PLINK 2.00, copyright (C) 2005-2018 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published by the
// Free Software Foundation; either version 3 of the License, or (at your
// option) any later version.
//
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
// for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library. If not, see <http://www.gnu.org/licenses/>.
// Low-level C99/C++03/C++11 library for reading .pgen (PLINK 2.0 binary) files
// (designed to produce good lowest-common-denominator binaries across
// Windows/OS X/Linux).
//
// File format design:
// - With the header loaded, it is possible to efficiently access a variant by
// its index. Since records can now be variable-length, this sometimes
// requires storage of record lengths.
// - Due to the power of LD-based compression, we permit a variant record to
// just store a list of differences from an earlier, fully stored variant.
// However, only short-range dependence is permitted; sequential processing
// of the file only requires caching of the most recent explicitly stored
// variant.
// - Like the plink1 format, this is balanced for relatively easy reading and
// writing; in particular, the mode-0x10/0x11 header is not read-optimized,
// it passes up some obvious compression opportunities which would make it
// more difficult to write e.g. an efficient file merger. This isn't a big
// deal if we don't have a huge number of one-sample .pgen files sharing a
// single .bim file (or equivalent). (If they don't share the same .bim
// file, .bim overhead > .pgen overhead.) If we ever do, we can define an
// additional mode to handle that case more efficiently.
// - Building blocks are arrays of 1-bit, 2-bit, 4-bit, 1-byte, 2-byte, 3-byte,
// and 4-byte values. 3/5/6/7(/9...)-bit values don't play well with
// bitwise operations, and when it's important, there's usually a natural way
// to split them into power-of-2-bit components.
// (unsigned integers known to be smaller than 2^24, but not known to be
// smaller than 2^16, are stored as 3-byte values on disk and "decompressed"
// to uint32_t during loading.)
// - Missing value is usually all-1s. (Only exceptions right now: plink1
// backward compatibility mode; presence/absence of rare alts for variants
// with >2 alt alleles is an array of 1-bit values, where absence = 0; and
// presence/absence of phasing info is similar.) Time to move away from 01
// nonsense.
// - Individual variant records are prohibited from being >= 4GiB, to reduce
// integer overflow issues. (This may be reduced to 2GiB later, but I'll
// attempt to handle the 2-4GiB range properly for now since it's conceivable
// for multiallelic records in very large datasets to reach that size.)
// - (later todo: include stuff like file creation command in .pvar header;
// that doesn't really belong in a binary file.)
// Additional parameter conventions:
// - "quaterarr" indicates a word-aligned, packed array of 2-bit values, while
// "quatervec" is the vector-aligned equivalent. "nibblearr" marks the much
// rarer case of a packed array of 4-bit values.
// - "quatervec_01" indicates a packed, vector-aligned array of 2-bit values
// where each value is zero or one. This data structure was used quite a bit
// bit by plink 1.9 for operating on a subset of a 2-bit-genotype array.
// - "genovec" indicates a quatervec containing genotype information.
// - "interleaved_vec" is the plink 2.0 replacement for quatervec_01: we
// basically stack pairs of adjacent vectors on top of each other and unpack
// on the fly, since that tends to be faster than having to access twice as
// much memory.
#include "plink2_base.h"
// 10000 * major + 100 * minor + patch
// Exception to CONSTI32, since we want the preprocessor to have access to this
// value. Named with all caps as a consequence.
#define PGENLIB_INTERNAL_VERNUM 1007
#ifdef __cplusplus
namespace plink2 {
#endif
// other configuration-ish values needed by plink2_common subset
typedef unsigned char AlleleCode;
typedef uint16_t DoubleAlleleCode;
static_assert(sizeof(DoubleAlleleCode) == 2 * sizeof(AlleleCode), "Inconsistent AlleleCode and DoubleAlleleCode definitions.");
// Set this to 65534 if AlleleCode is uint16_t, 2^24 - 1 if uint32_t.
CONSTI32(kPglMaxAltAlleleCt, 254);
#define kMissingAlleleCode S_CAST(AlleleCode, -1)
CONSTI32(kAlleleCodesPerVec, kBytesPerVec / sizeof(AlleleCode));
// more verbose than (val + 3) / 4, but may as well make semantic meaning
// obvious; any explicit DivUp(val, 4) expressions should have a different
// meaning
// (not needed for bitct -> bytect, DivUp(val, CHAR_BIT) is clear enough)
HEADER_INLINE uintptr_t QuaterCtToByteCt(uintptr_t val) {
return DivUp(val, 4);
}
HEADER_INLINE uintptr_t QuaterCtToVecCt(uintptr_t val) {
return DivUp(val, kQuatersPerVec);
}
HEADER_INLINE uintptr_t QuaterCtToWordCt(uintptr_t val) {
return DivUp(val, kBitsPerWordD2);
}
HEADER_INLINE uintptr_t QuaterCtToAlignedWordCt(uintptr_t val) {
return kWordsPerVec * QuaterCtToVecCt(val);
}
HEADER_INLINE uintptr_t QuaterCtToCachelineCt(uintptr_t val) {
return DivUp(val, kQuatersPerCacheline);
}
HEADER_INLINE uintptr_t AlleleCodeCtToVecCt(uintptr_t val) {
return DivUp(val, kAlleleCodesPerVec);
}
HEADER_INLINE uintptr_t AlleleCodeCtToAlignedWordCt(uintptr_t val) {
return kWordsPerVec * AlleleCodeCtToVecCt(val);
}
HEADER_INLINE uintptr_t GetQuaterarrEntry(const uintptr_t* quaterarr, uint32_t idx) {
return (quaterarr[idx / kBitsPerWordD2] >> (2 * (idx % kBitsPerWordD2))) & 3;
}
// todo: check if this optimizes newval=0 out
HEADER_INLINE void AssignQuaterarrEntry(uint32_t idx, uintptr_t newval, uintptr_t* quaterarr) {
const uint32_t bit_shift_ct = 2 * (idx % kBitsPerWordD2);
uintptr_t* wordp = &(quaterarr[idx / kBitsPerWordD2]);
*wordp = ((*wordp) & (~((3 * k1LU) << bit_shift_ct))) | (newval << bit_shift_ct);
}
HEADER_INLINE void ClearQuaterarrEntry(uint32_t idx, uintptr_t* quaterarr) {
quaterarr[idx / kBitsPerWordD2] &= ~((3 * k1LU) << (idx % kBitsPerWordD2));
}
HEADER_INLINE uintptr_t GetNibbleArrEntry(const uintptr_t* nibblearr, uint32_t idx) {
return (nibblearr[idx / kBitsPerWordD4] >> (4 * (idx % kBitsPerWordD4))) & 15;
}
#ifdef USE_SSE42
HEADER_INLINE uint32_t Popcount01Word(uintptr_t val) {
return PopcountWord(val);
}
#else
HEADER_INLINE uint32_t Popcount01Word(uintptr_t val) {
return QuatersumWord(val);
}
#endif
// safe errstr_buf size for pgen_init_phase{1,2}()
CONSTI32(kPglErrstrBufBlen, kPglFnamesize + 256);
// assumes subset_mask has trailing zeroes up to the next vector boundary
void FillInterleavedMaskVec(const uintptr_t* __restrict subset_mask, uint32_t base_vec_ct, uintptr_t* interleaved_mask_vec);
HEADER_INLINE void CopyQuaterarr(const uintptr_t* __restrict source_quaterarr, uint32_t quaterarr_entry_ct, uintptr_t* __restrict target_quaterarr) {
memcpy(target_quaterarr, source_quaterarr, QuaterCtToWordCt(quaterarr_entry_ct) * kBytesPerWord);
}
// may want bit past the end of subset_mask (i.e. position
// raw_quaterarr_entry_ct) to always be allocated and unset. This removes the
// need for some explicit end-of-bitarray checks.
void CopyQuaterarrNonemptySubset(const uintptr_t* __restrict raw_quaterarr, const uintptr_t* __restrict subset_mask, uint32_t raw_quaterarr_entry_ct, uint32_t subset_entry_ct, uintptr_t* __restrict output_quaterarr);
// Copies a bit from raw_bitarr for each 01 entry in genovec.
void Copy01Subset(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict genovec, uint32_t write_bit_idx_start, uint32_t bit_ct, uintptr_t* __restrict output_bitarr);
void Copy10Subset(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict genovec, uint32_t bit_ct, uintptr_t* __restrict output_bitarr);
void GenovecCountFreqsUnsafe(const uintptr_t* genovec, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts);
void GenovecCountSubsetFreqs(const uintptr_t* __restrict genovec, const uintptr_t* __restrict sample_include_interleaved_vec, uint32_t raw_sample_ct, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts);
// slower GenovecCountSubsetFreqs() which does not require
// sample_include_interleaved_vec to be precomputed (and incidentally doesn't
// require vector alignment)
void GenoarrCountSubsetFreqs2(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict sample_include, uint32_t raw_sample_ct, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts);
void GenoarrCountSubsetIntersectFreqs(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict subset1, const uintptr_t* __restrict subset2, uint32_t raw_sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts);
void GenovecInvertUnsafe(uint32_t sample_ct, uintptr_t* genovec);
HEADER_INLINE uintptr_t InvertGenoWordUnsafe(uintptr_t geno_word) {
return (geno_word ^ ((~(geno_word << 1)) & kMaskAAAA));
}
// too easy to forget to multiply by 2
HEADER_INLINE void ZeroTrailingQuaters(uintptr_t quater_ct, uintptr_t* bitarr) {
ZeroTrailingBits(quater_ct * 2, bitarr);
}
HEADER_INLINE void SetTrailingQuaters(uintptr_t quater_ct, uintptr_t* bitarr) {
const uintptr_t trail_ct = quater_ct % kBitsPerWordD2;
if (trail_ct) {
bitarr[quater_ct / kBitsPerWordD2] |= (~k0LU) << (quater_ct * 2);
}
}
// A VINT is a sequence of bytes where each byte stores just 7 bits of an
// an integer, and the high bit is set when the integer has more nonzero bits.
// See e.g.
// https://developers.google.com/protocol-buffers/docs/encoding#varints
// (Note that protocol buffers used "group varints" at one point, but then
// abandoned them. I suspect they'd be simultaneously slower and less
// compact here.)
HEADER_INLINE unsigned char* Vint32Append(uint32_t uii, unsigned char* buf) {
while (uii > 127) {
*buf++ = (uii & 127) + 128;
uii >>= 7;
}
*buf++ = uii;
return buf;
}
// Returns 0x80000000U on read-past-end instead of UINT32_MAX so overflow check
// works properly in 32-bit build. Named "GetVint31" to make it more obvious
// that a 2^31 return value can't be legitimate.
HEADER_INLINE uint32_t GetVint31(const unsigned char* buf_end, const unsigned char** buf_iterp) {
if (buf_end > (*buf_iterp)) {
uint32_t vint32 = *((*buf_iterp)++);
if (vint32 <= 127) {
return vint32;
}
vint32 &= 127;
uint32_t shift = 7;
while (buf_end > (*buf_iterp)) {
uint32_t uii = *((*buf_iterp)++);
vint32 |= (uii & 127) << shift;
if (uii <= 127) {
return vint32;
}
shift += 7;
// currently don't check for shift >= 32 (that's what validate_vint31()
// is for).
}
}
return 0x80000000U;
}
// Input must be validated, or bufp must be >= 5 characters before the end of
// the read buffer. Currently unused.
// todo: check if this has enough of a speed advantage over GetVint31() to
// justify using this in the main loops and catching SIGSEGV. (seems to be no
// more than 3%?)
/*
HEADER_INLINE uint32_t GetVint31Unsafe(const unsigned char** buf_iterp) {
uint32_t vint32 = *(*buf_iterp)++;
if (vint32 <= 127) {
return vint32;
}
vint32 &= 127;
for (uint32_t shift = 7; shift < 32; shift += 7) {
uint32_t uii = *(*buf_iterp)++;
vint32 |= (uii & 127) << shift;
if (uii <= 127) {
return vint32;
}
}
return 0x80000000U;
}
*/
// Does not update buf_iter.
HEADER_INLINE uint32_t PeekVint31(const unsigned char* buf_iter, const unsigned char* buf_end) {
if (buf_end > buf_iter) {
uint32_t vint32 = *buf_iter++;
if (vint32 <= 127) {
return vint32;
}
vint32 &= 127;
uint32_t shift = 7;
while (buf_end > buf_iter) {
uint32_t uii = *buf_iter++;
vint32 |= (uii & 127) << shift;
if (uii <= 127) {
return vint32;
}
shift += 7;
}
}
return 0x80000000U;
}
/*
HEADER_INLINE uint32_t FGetVint31(FILE* ff) {
// Can't be used when multiple threads are reading from ff.
uint32_t vint32 = getc_unlocked(ff);
if (vint32 <= 127) {
return vint32;
}
vint32 &= 127;
for (uint32_t shift = 7; shift < 32; shift += 7) {
uint32_t uii = getc_unlocked(ff);
vint32 |= (uii & 127) << shift;
if (uii <= 127) {
return vint32;
}
}
return 0x80000000U;
}
HEADER_INLINE void FPutVint31(uint32_t uii, FILE* ff) {
// caller's responsibility to periodically check ferror
while (uii > 127) {
putc_unlocked((uii & 127) + 128, ff);
uii >>= 7;
}
putc_unlocked(uii, ff);
}
*/
// Need this for sparse multiallelic dosage.
HEADER_INLINE unsigned char* Vint64Append(uint64_t ullii, unsigned char* buf) {
while (ullii > 127) {
*buf++ = (ullii & 127) + 128;
ullii >>= 7;
}
*buf++ = ullii;
return buf;
}
// Returns 2^63 on read-past-end, and named GetVint63 to make it more obvious
// that a 2^63 return value can't be legitimate.
HEADER_INLINE uint64_t GetVint63(const unsigned char* buf_end, const unsigned char** buf_iterp) {
if (buf_end > (*buf_iterp)) {
uint64_t vint64 = *((*buf_iterp)++);
if (vint64 <= 127) {
return vint64;
}
vint64 &= 127;
uint32_t shift = 7;
while (buf_end > (*buf_iterp)) {
uint64_t ullii = *((*buf_iterp)++);
vint64 |= (ullii & 127) << shift;
if (ullii <= 127) {
return vint64;
}
shift += 7;
// currently don't check for shift >= 32 (that's what validate_vint31()
// is for).
}
}
return (1LLU << 63);
}
// main batch size
CONSTI32(kPglQuaterTransposeBatch, kQuatersPerCacheline);
// word width of each matrix row
CONSTI32(kPglQuaterTransposeWords, kWordsPerCacheline);
#ifdef USE_AVX2
CONSTI32(kPglQuaterTransposeBufbytes, kPglQuaterTransposeBatch);
void TransposeQuaterblockAvx2(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* __restrict write_iter, void* __restrict buf0);
#else
CONSTI32(kPglQuaterTransposeBufbytes, (kPglQuaterTransposeBatch * kPglQuaterTransposeBatch) / 2);
void TransposeQuaterblockNoAvx2(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* __restrict write_iter, unsigned char* __restrict buf0, unsigned char* __restrict buf1);
#endif
CONSTI32(kPglQuaterTransposeBufwords, kPglQuaterTransposeBufbytes / kBytesPerWord);
// up to 256x256; vecaligned_buf must have size 32k (256 bytes in AVX2 case)
// write_iter must be allocated up to at least
// RoundUpPow2(write_batch_size, 2) rows
HEADER_INLINE void TransposeQuaterblock(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* write_iter, VecW* vecaligned_buf) {
#ifdef USE_AVX2
// assert(!(write_ul_stride % 2));
TransposeQuaterblockAvx2(read_iter, read_ul_stride, write_ul_stride, read_batch_size, write_batch_size, write_iter, vecaligned_buf);
#else
TransposeQuaterblockNoAvx2(read_iter, read_ul_stride, write_ul_stride, read_batch_size, write_batch_size, write_iter, R_CAST(unsigned char*, vecaligned_buf), &(R_CAST(unsigned char*, vecaligned_buf)[kPglQuaterTransposeBufbytes / 2]));
#endif
}
// replaces each x with (32768 - x)
// okay for dosage_main to be nullptr if dosage_ct == 0
void BiallelicDosage16Invert(uint32_t dosage_ct, uint16_t* dosage_main);
// replaces each x with -x
void BiallelicDphase16Invert(uint32_t dphase_ct, int16_t* dphase_delta);
// currently does zero trailing halfword
void GenovecToMissingnessUnsafe(const uintptr_t* __restrict genovec, uint32_t sample_ct, uintptr_t* __restrict missingness);
// currently does not zero trailing halfword
void GenovecToNonmissingnessUnsafe(const uintptr_t* __restrict genovec, uint32_t sample_ct, uintptr_t* __restrict nonmissingness);
// N.B. assumes trailing bits of loadbuf have been filled with 1s, not 0s
// Also takes genoarr word count instead of sample count.
void SplitHomRef2hetUnsafeW(const uintptr_t* genoarr, uint32_t inword_ct, uintptr_t* __restrict hom_buf, uintptr_t* __restrict ref2het_buf);
void SplitHomRef2het(const uintptr_t* genoarr, uint32_t sample_ct, uintptr_t* __restrict hom_buf, uintptr_t* __restrict ref2het_buf);
// ----- end plink2_common subset -----
// other configuration-ish values
// this part of the specification is set in stone.
CONSTI32(kPglVblockSize, 65536);
// Currently chosen so that it plus kPglFwriteBlockSize + kCacheline - 2 is
// < 2^32, so DivUp(kPglMaxBytesPerVariant + kPglFwriteBlockSize - 1,
// kCacheline) doesn't overflow.
static const uint32_t kPglMaxBytesPerVariant = 0xfffdffc0U;
// CONSTI32(kPglMaxBytesPerDataTrack, 0x7ffff000);
// static_assert(kMaxBytesPerIO >= (int32_t)kPglMaxBytesPerDataTrack, "pgenlib_internal assumes a single variant data track always fits in one fread/fwrite operation.");
// mmap is a horrible idea for 32-bit builds, and as long as we have non-mmap
// code we may as well not worry about Win64 CreateFileMapping.
// also, OS X mmap implementation seems to be crappy for large sequentially
// accessed files, compared to Linux.
// possible todo: SIGBUS handling? do we ever want to try to recover from an
// I/O error?
#if defined(_WIN32) || !defined(__LP64__)
# define NO_MMAP
#endif
// currently must be power of 2, and multiple of (kBitsPerWord / 2)
CONSTI32(kPglDifflistGroupSize, 64);
FLAGSET_DEF_START()
kfPgenGlobal0,
kfPgenGlobalLdCompressionPresent = (1 << 0),
kfPgenGlobalDifflistOrLdPresent = (1 << 1),
// set opportunistically for now; may still be present if unset.
kfPgenGlobalMultiallelicHardcallFound = (1 << 2),
kfPgenGlobalHardcallPhasePresent = (1 << 3),
kfPgenGlobalDosagePresent = (1 << 4),
kfPgenGlobalDosagePhasePresent = (1 << 5),
kfPgenGlobalAllNonref = (1 << 6)
FLAGSET_DEF_END(PgenGlobalFlags);
FLAGSET_DEF_START()
kfPgrLdcache0,
kfPgrLdcacheQuater = (1 << 0),
kfPgrLdcacheDifflist = (1 << 1),
kfPgrLdcacheRawQuater = (1 << 2),
// may also want RawDifflist
kfPgrLdcacheBasicGenocounts = (1 << 3)
FLAGSET_DEF_END(PgrLdcacheFlags);
// difflist/LD compression should never involve more than
// raw_sample_ct / kPglMaxDifflistLenDivisor
// entries. (however, returned difflists can have up to twice as many entries,
// when a variant is LD-compressed and the reference variant is
// difflist-compressed.)
CONSTI32(kPglMaxDifflistLenDivisor, 8);
// threshold for using a deltalist to represent a bitarray on disk (currently
// relevant for dosage data)
CONSTI32(kPglMaxDeltalistLenDivisor, 9);
// The actual format:
// 1. 2 magic bytes 0x6c 0x1b.
//
// 2. Mode byte.
// 0x01 = plink1 variant-major.
// 0x02 = plink2 basic variant-major. variant/sample counts in header,
// 00 = hom ref, 01 = het, 10 = hom alt, 11 = missing. (vrtype 0)
// 0x03 = plink2 basic unphased dosage (vrtype 0x40)
// 0x04 = plink2 basic phased dosage (vrtype 0xc0)
// These are designed to be easy to write. Note that the dosage formats
// require hardcalls to be stored as well; however, you can just set them
// to all-missing and then use
// plink2 --hard-call-threshold [...] --make-pgen
// to populate them.
//
// 0x10 = variable-type and/or variable-length records present.
// 0x11 = mode 0x10, but with phase set information at the end of the
// file.
// larger values, and 0x05..0x0f, reserved for now.
//
// 3. If not plink1-format,
// a. 4-byte # of variants; call this M.
// b. 4-byte # of samples, call this N.
// c. Additional 1-byte header 'control' value (PgenHeaderCtrl). May be
// extended in the future.
// bits 0-3: Indicates vrtype and variant record length storage widths.
// If bit 3 is unset, bits 0-1 store (vrec_len_byte_ct - 1), while bit
// 2 is set iff phase or dosage info is present (requiring 8 bits
// instead of 4 bits for vrtypes).
// If bit 3 is set, a specialized encoding is used which combines the
// two pieces of information (reducing the overhead for files with few
// samples). The following encodings are currently defined:
// 1000: No difflist/LD/onebit compression, 2 bit
// (vrec_len - ceil(sample_ct / 4))) value. vrtype is zero if
// the entry is zero, and 8 (multiallelic-hardcall) if the record
// has 1-3 extra bytes. Designed for single-sample files sharing
// a single .bim-like file (note that if they don't share a .bim,
// .bim size will dominate).
// 1001: No difflist/LD/onebit compression, 4 bit
// (vrec_len - ceil(sample_ct / 4)) value. vrtype is zero if the
// entry is zero, and 8 if the record has 1-15 extra bytes.
// bits 4-5: allele count storage (00 = unstored, 01-11 = bytes per ct)
// bits 6-7: nonref flags info (00 = unstored, 01 = all ref/alt, 10 =
// never ref/alt, 11 = explicitly stored)
// Bits 0-5 do not apply to the fixed-length modes (currently 0x02-0x04)
// and should be zeroed out in that case.
//
// 4. If mode 0x10/0x11,
// a. Array of 8-byte fpos values for the first variant in each vblock.
// (Note that this suggests a way to support in-place insertions: some
// unused space can be left between the vblocks.)
// b. Sequence of header blocks, each containing information about
// kPglVblockSize variants (except the last may be shorter). All values
// are known-width, to allow e.g. plink2 --make-pgen/--pmerge to compress
// all variant records first, then fseek to the beginning of the output
// file and write the header.
// i. array of 4-bit or 1-byte vrtypes.
// ii. array of variant record lengths (each occupying vrec_len_byte_ct
// bytes, or 4 bits).
// iii. if bits 4-5 of {3c} aren't 00, array of alt allele counts.
// iv. nonref flags info, if explicitly stored
// (this representation allows more efficient random access)
// If mode 0x02-0x04, and nonref flags info explicitly stored, just that
// bitarray.
//
// 5. The variant records. See below for details.
// Difflist format:
// a. [difflist_len VINT]
// If difflist_len is zero, that's it. Otherwise, the difflist is organized
// into 64-element groups (the last group will usually be smaller), to make
// extraction of e.g. a single sample less painful. Note that with 20k
// samples, a difflist is space-saving even with MAF 5%:
// ~1/400 hom alt + ~38/400 het = (~39/400) * 20k
// = ~1950 sample IDs.
// that's 31 groups, requiring about 2 + 62 + 30 + 488 + 1919 = 2501 bytes
// (can be slightly higher since a few ID deltas may be larger than 127);
// uncompressed storage requires 5000 bytes.
// b. [array of group start sample IDs, each of sample_id_byte_ct]
// c. [array of 1-byte [delta segment lengths minus 63], with last entry
// omitted]
// d. [optional payload of fixed-width genotype values]
// (in retrospect, this should have been positioned after (e) and omitted
// from a lower-level packed-bitarray definition, oh well...)
// e. one "delta segment"/group: [array of [group size - 1] VINT values,
// each indicating the difference between the current and previous sample
// IDs; i.e. value is 1 for two consecutive samples]
// PgenFileInfo and PgenReader are the main exported "classes".
// Exported functions involving these data structure should all have
// "pgfi"/"pgr" in their names.
// Note that this can be default-copied.
struct PgenFileInfoStruct {
// ----- Header information, constant after initialization -----
uint32_t raw_variant_ct;
uint32_t raw_sample_ct;
// 0 if variant records aren't all the same length.
// If they are (e.g. PLINK 1 encoding; or vrtype bits 0-5 unset), we just
// fseek to
// const_fpos_offset + const_vrec_width * ((uint64_t)variant_idx).
uint64_t const_fpos_offset;
uint32_t const_vrec_width;
// see below. positioned here instead of slightly later due to struct
// packing behavior.
uint32_t const_vrtype; // 256 for plink 1 encoding, UINT32_MAX for nonconst
// size (raw_variant_ct + 1), so that the number of bytes of (zero-based)
// variant n is var_fpos[n+1] - var_fpos[n]. nullptr if
// const_vrec_width is nonzero.
// It's not difficult to save some memory here (e.g. unless we're dealing
// with >256 TB files, it's trivial to go from 8 bytes down to 6 bytes per
// entry), but I doubt that's worth the trouble; let's worry about
// O(mn)-or-worse stuff, and on-disk stuff, first.
uint64_t* var_fpos;
// representation type codes.
//
// bits 0-2:
// 000 = Simple 2-bit encoding.
// 100, 110, 111 = Simple difflist. Low two bits store the base value.
// (101 is currently reserved for future use since
// almost-all-ref/altx variants shouldn't really exist, thanks to
// Hardy-Weinberg equilibrium.)
// 010 = Differences-from-earlier-variant encoding ("LD compression"). The
// last variant without this type of encoding is the base.
// To simplify random access logic, the first variant in each vblock
// is prohibited from using this encoding.
// 011 = Inverted differences-from-earlier-variant encoding. (This covers
// the case where a reference allele is 'wrong'.) When decoding, the
// difflist should be processed first, then the entire genovec should
// be flipped.
// 001 = 1-bit + difflist representation. Suppose most calls are
// hom ref or het (e.g. a 20% MAF variant with ~4% hom alt1, ~36%
// het ref/alt1, ~64% hom ref), then the main datatrack has just the
// low bits of the usual 2-bit codes. This is followed by a difflist
// containing the hom alt1 and missing genotypes.
// The main datatrack is preceded by a single byte indicating what
// the two common values are: 2 low bits = [set value - unset value],
// next 2 bits = unset value (6 possibilities). Top 4 bits are
// reserved.
// bit 3: more than 1 alt allele? If yes, auxiliary data track #1
// disambiguates the 0b01 (ref/altx) and 0b10 (altx/alty, x may equal
// y) hardcalls. This contains a format byte, followed by a list of
// ref/altx patches, then a list of altx/alty patches. All unpatched
// genotypes are ref/alt1 or alt1/alt1.
// The bottom 4 bits of the format byte describe how the ref/altx
// patch set is stored.
// 0 = Starts with a bitarray with [total ref/altx count] bits (divide by 8
// and round up to get byte count of this component; any trailing bits
// in the last byte must be 0), where each set bit corresponds to
// presence of a rarealt (i.e. alt2/alt3/...).
// ExpandBytearr(aux1_first_quarter, all_01, raw_sample_ctl, ct_01, 0,
// patch_01);
// can be used to convert this into a set of sample IDs, though we may
// want to avoid an intermediate unpacking step in practice. Note that
// when we're passing in sample_ct < raw_sample_ct and the main
// datatrack is LD-compressed, we'd like ldbase_raw_genovec to be
// cached.
// This is followed by a packed array of fixed-width [allele idx - 2]
// values, where the width depends on the total number of alt alleles.
// 2 alts: width ZERO. All set bits in the first bitarray correspond
// to ref/alt2.
// 3 alts: width 1 bit. Set bits correspond to ref/alt3, clear bits
// correspond to ref/alt2.
// 4-5 alts: width 2 bits. 0b00 corresponds to ref/alt2, 0b01 =
// ref/alt3, 0b10 = ref/alt4, etc.
// 6-17 alts: width 4 bits.
// 18-257 alts: width 8 bits.
// 258-65537 alts: width 16 bits.
// 65538-16777215 alts: width 24 bits. Reasonable to prohibit more
// than 2^24 - 1 = 16777215, since variant records
// are limited to 4 GiB. I can imagine some
// applications of >65534 in highly variable
// regions, though, and it doesn't actually cost
// us anything to define a way to represent it.
// (A plink2 binary compiled with AlleleCode
// typedef'd as uint32_t will run more slowly, of
// course, but most binaries will not be compiled
// that way.)
// 1 = Same as mode 0, except the initial bitarray is replaced by a
// difflist with sample IDs. (We could make that piece somewhat
// smaller by storing 0-based ref/altx indexes instead, but I'm pretty
// sure that isn't worth the performance penalty of requiring all_01
// and more complicated unpacking. Though we'll need to peek at
// aux1[0] before decompressing the main datatrack to exploit this.)
// 15 = Null (no ref/altx exists in the maintrack)? Might remove this
// (storing this as mode 0 takes no more space, and mode 1 just takes
// 1 more byte), but it may enable some relevant performance
// optimizations.
// 2-14 are reserved for future use. We don't define an efficient way to
// represent a variant that e.g. has more alt2s than alt1s for now, since
// alt alleles will usually be sorted in order of decreasing frequency, but
// maybe this will matter in the future.
//
// The top 4 bits describe how the altx/alty patch set is stored. 0/1/15
// have the same meaning as they do for the ref/altx patch set; the only
// thing that changes is the format of the packed array of values at the
// end.
// 2 alts: width 1. This is treated as a special case. Set bits
// correspond to alt2/alt2, clear = alt1/alt2.
// 3-4 alts: width 2+2 bits. Each stores [allele idx - 1], with the
// smaller number in the lower bits. E.g. alt1/alt2 is stored as
// 0b0100; alt3/alt3 is stored as 0b1010.
// 5-16 alts: width 4+4 bits.
// 17-256 alts: width 8+8 bits.
// 257-65536 alts: width 16+16 bits.
// 65537-16777215 alts: width 24+24 bits.
//
// bit 4: hardcall phased? If yes, auxiliary data track #2 contains
// phasing information for heterozygous calls.
// The first *bit* of the track indicates whether an explicit
// "phasepresent" bitarray is stored. If it's set, the next het_ct
// bits are 1-bit values, where 0 = no phasing info known, and 1 =
// phasing info present. If it's unset, phasing info is present for
// every het call.
// This is followed by a "phaseinfo" bitarray, where 0 = unswapped,
// 1 = swapped (e.g. "1|0" in VCF).
// This track is normally unpacked into fixed-size bitarrays when
// loaded, but a raw mode is also provided (which doesn't support
// subsetting).
// By default, entire chromosomes/contigs are assumed to be phased
// together. (Todo: support contiguous phase sets.)
//
// bits 5-6:
// 00 = no dosage data.
// 01 = dosage list. Auxiliary data track #3 contains a delta-encoded list
// of sample IDs (like a difflist, but with no genotypes). Track #4
// contains a 16-bit (0..2^15; 65535 missing value is only permitted
// in unconditional-dosage case) value expressing the sum of all alt
// allele dosages. (Yes, making this the ref allele dosage would have
// been a bit cleaner, but it's too late now.)
// If the variant is multiallelic, nonzero alt2/alt3/... dosages are
// likely to be sparse. So,
// - track #5 contains a delta-encoded list describing which
// [sample_uidx x rarealt dosage] entries are nonzero, where rarealt
// index is in the lowest bits and sample_uidx can be computed via
// right-shift (to avoid integer-division headaches, especially
// important since indexes in this list can be larger than 2^32).
// We use sample_uidx here to make subsetting less painful.
// Since each nonzero dosage value requires 16 bits, and
// delta-encoding of a dense list adds less than 9 bits per entry,
// there isn't much point in supporting a dense bitarray mode here.
// To keep memory requirements sane for biobank-scale datasets when
// the number of alt alleles is very large, each sample is
// prohibited from having more than 255 nonzero allele dosages (this
// makes no difference when sizeof(AlleleCode) == 1, but it may
// matter later).
// - track #6 contains the rarealt nonzero dosage values.
// Note that this and the other dosage modes are in ADDITION to
// hardcalls. This increases filesize by up to 12.5%, but makes the
// reader substantially simpler; --hard-call-threshold logic is nicely
// compartmentalized.
// 10 = unconditional dosage (just track #4).
// 11 = dosage bitarray. In this case, auxiliary data track #3 contains an
// array of 1-bit values indicating which samples have dosages. If
// the variant is multiallelic, tracks #5 and 6 are as described
// above.
// bgen 1.2 format no longer permits fractional missingness, so no good
// reason for us to support it.
// Considered putting *all* dosage data at the end of the file (like I will
// do for phase set info); this could actually be worthwhile for
// unconditional dosages, but it doesn't work well when only some samples
// have dosage data.
// bit 7: some dosages explicitly phased? If yes, and dosages are not
// unconditionally present, auxiliary data track #7 is a bitarray of
// length dosage_ct indicating whether dphase_delta exists for that
// sample. Note that this is technically independent of bit 4; either
// can be set without the other. (However, in practice, bit 4 is
// almost always set when bit 7 is, since that enables more efficient
// storage of 0|0.99, 1|0.02, and similar pairs.)
// When phased dosages are present, track #8 contains values
// representing [(hap1 alt prob) - (hap2 alt prob)], etc., where the
// underlying values are represented in [0..16384] (so the signed
// difference is in [-16384..16384]). Track #4 contains the
// corresponding sums; parity does NOT need to match (necessary to
// allow this since we treat omitted values as zero; and since we are
// allowing it, no point in making parity match in other situations
// either). In fixed-width case, -32768 should be stored in track #8
// when the entire call is missing, while 0 and missing-phase are
// considered synonymous.
// In the biallelic case, if a hardcall is phased, a dosage is
// present, and no explicit dosage-phase is, we define it to mean the
// unique dphase_delta sequence with maximal absolute value, and
// --make-pgen takes advantage of it. This definition technically
// works for triallelic variants as well, but it breaks down with 4
// alleles, so we prohibit hardcall-phase + dosage + no-dosage-phase
// with more than 2 alleles.
// In the multiallelic case, tracks #9 and #10 are analogous to #5 and
// #6.
//
// Representation of variable ploidy (MT) was considered, but rejected since
// dosages should be at least as appropriate for MT.
// Oxford/VCF-style storage of separate probabilities for every possible
// genotype (e.g. P(AA), P(AB), P(BB) instead of just 2P(AA) + P(AB) and
// 2P(BB) + P(AB)) is tentatively rejected due to (i) lack of relevance to
// PLINK's analysis functions and (ii) high storage cost where we can afford
// it least. In principle, this is subject to reevaluation if (i) changes,
// but given the poor interaction with phased dosages, it's probably better
// to just think of them as permanently outside PLINK's scope.
//
//
// base pointer is null if mode is 0x01-0x04 (const_vrtype != UINT32_MAX).
// if not nullptr, required to be length >=
// max(raw_variant_ct + 1, RoundUpPow2(raw_variant_ct, kBytesPerWord))
unsigned char* vrtypes;
// alt allele counts.
// This can be nullptr if all alt allele counts are 1.
// (actually, we store the allele index offsets, so
// (allele_idx_offsets[n+1] - allele_idx_offsets[n]) is the number of alleles
// for variant n. Otherwise, we'd need another data structure to support
// fast allele name lookup.)
uintptr_t* allele_idx_offsets;
uintptr_t* nonref_flags;
// If pgr.nonref_flags is nullptr and kfPgenGlobalAllNonref is unset, all
// reference alleles are assumed to be correct.
PgenGlobalFlags gflags;
uint32_t max_allele_ct;
// uint32_t max_dosage_allele_ct; // might need this later
// * nullptr if using mmap
// * if using per-variant fread(), this is non-null during Pgen_file_info
// initialization, but it's then "moved" to the first Pgen_reader and set
// to nullptr.
FILE* shared_ff;
const unsigned char* block_base; // nullptr if using per-variant fread()
uint64_t block_offset; // 0 for mmap
#ifndef NO_MMAP
uint64_t file_size;
#endif
};
typedef struct PgenFileInfoStruct PgenFileInfo;
struct PgenReaderStruct {
NONCOPYABLE(PgenReaderStruct);
// would like to make this const, but that makes initialization really
// annoying in C99
struct PgenFileInfoStruct fi;
// ----- Mutable state -----
// If we don't fseek, what's the next variant we'd read? (Still relevant
// with mmap due to how LD decompression is implemented.)
uint32_t fp_vidx;
// ** per-variant fread()-only **
FILE* ff;
unsigned char* fread_buf;
// ** end per-variant fread()-only **
// if LD compression is present, cache the last non-LD-compressed variant
uint32_t ldbase_vidx;
// flags indicating which base_variant buffers are populated
PgrLdcacheFlags ldbase_stypes;
uint32_t ldbase_difflist_len;
// these should be treated as private after initial allocation.
// not currently guaranteed to have trailing zeroes.
uintptr_t* ldbase_raw_genovec; // now allocated even with no LD compression
uintptr_t* ldbase_genovec;
uintptr_t* ldbase_raregeno;
// when ldbase_difflist_ids[] is initialized, element [ldbase_difflist_len]
// must be set to sample_ct.
uint32_t* ldbase_difflist_sample_ids;
// common genotype can be looked up from vrtypes[]
STD_ARRAY_DECL(uint32_t, 4, ldbase_basic_genocounts);
// now only allocated if multiallelic variants, phase, and/or dosage present
// most commonly used for unsubsetted genovec; all_hets can be computed from
// this and patch_10_{set,vals}, and then aux2 can be interpreted.
// can also be used for other purposes after we're done processing aux2.
uintptr_t* workspace_vec;
// currently must hold (raw_sample_ct / kPglMaxDifflistLenDivisor)
// entries; may need to double the sizes later
// some top-level interface functions use these, so several lower-level
// functions cannot
uintptr_t* workspace_raregeno_vec;
uint32_t* workspace_difflist_sample_ids;
// must hold (raw_sample_ct / kPglMaxDifflistLenDivisor) entries
uintptr_t* workspace_raregeno_tmp_loadbuf;
uintptr_t* workspace_aux1x_present;
uintptr_t* workspace_all_hets;
uintptr_t* workspace_subset; // currently used for hphase decoding
uintptr_t* workspace_dosage_present;
uintptr_t* workspace_dphase_present;
// phase set loading (mode 0x11) unimplemented for now; should be a sequence
// of (sample ID, [uint32_t phase set begin, set end), [set begin, set end),
// ...).
};
typedef struct PgenReaderStruct PgenReader;
// might want this value to be typed...
CONSTI32(kPglVrtypePlink1, 256);
HEADER_INLINE uint32_t GetPgfiVrtype(const PgenFileInfo* pgfip, uint32_t vidx) {
if (pgfip->vrtypes) {
return pgfip->vrtypes[vidx];
}
return pgfip->const_vrtype;
}
HEADER_INLINE uint64_t GetPgfiFpos(const PgenFileInfo* pgfip, uintptr_t vidx) {
if (pgfip->var_fpos) {
return pgfip->var_fpos[vidx];
}
return pgfip->const_fpos_offset + pgfip->const_vrec_width * S_CAST(uint64_t, vidx);
}
HEADER_INLINE uint32_t GetPgfiVrecWidth(const PgenFileInfo* pgfip, uint32_t vidx) {
if (pgfip->var_fpos) {
return pgfip->var_fpos[vidx + 1] - pgfip->var_fpos[vidx];
}
return pgfip->const_vrec_width;
}
HEADER_INLINE uint32_t PgfiIsSimpleFormat(const PgenFileInfo* pgfip) {
return (pgfip->const_vrtype != UINT32_MAX);
}
HEADER_INLINE uint32_t VrtypeDifflist(uint32_t vrtype) {
return (vrtype & 4);
}
HEADER_INLINE uint32_t VrtypeLdCompressed(uint32_t vrtype) {
return (vrtype & 6) == 2;
}
// Only checks for rarealt-containing hardcall. Multiallelic dosage may still
// be present when this returns zero.
HEADER_INLINE uint32_t VrtypeMultiallelicHc(uint32_t vrtype) {
return (vrtype & 8);
}
HEADER_INLINE uint32_t VrtypeHphase(uint32_t vrtype) {
return (vrtype & 0x10);
}
HEADER_INLINE uint32_t VrtypeAuxTracksPresent(uint32_t vrtype) {
return (vrtype & 0x78);
}
HEADER_INLINE uint32_t VrtypeVariableWidth(uint32_t vrtype) {
return (vrtype & 0x3e);
}
HEADER_INLINE uint32_t VrtypeDosage(uint32_t vrtype) {
return (vrtype & 0x60);
}
static_assert(kPglMaxAltAlleleCt <= 254, "GetAux1xAlleleEntryByteCt() needs to be updated.");
HEADER_INLINE uintptr_t GetAux1aAlleleEntryByteCt(uint32_t allele_ct, uint32_t ct_01) {
assert(allele_ct >= 3);
if (allele_ct == 3) {
return 0;
}
if (allele_ct == 4) {
return DivUp(ct_01, 8);
}
if (allele_ct <= 6) {
return DivUp(ct_01, 4);
}
if (allele_ct <= 18) {
return DivUp(ct_01, 2);
}
return ct_01;
}
HEADER_INLINE uintptr_t GetAux1bAlleleEntryByteCt(uint32_t allele_ct, uint32_t ct_10) {
assert(allele_ct >= 3);
if (allele_ct == 3) {
return DivUp(ct_10, 1);
}
if (allele_ct < 6) {
return DivUp(ct_10, 4);
}
// one byte per entry for allele_ct <= 17, two bytes for 18..256
return ((allele_ct >= 18) + 1) * ct_10;
// todo: allele_ct > 257
}
// PgenFileInfo initialization is split into two phases, to decouple
// plink2's arena allocator from this library.
//
// Phase 1: Open the .pgen; verify that the initial bytes are consistent with
// the file format; load/verify sample and variant counts, initialize
// pgfi.const_vrtype, pgfi.const_vrec_width, and pgfi.const_fpos_offset;
// determine initial memory allocation requirement. first_alloc_cacheline_ct
// does not include allele counts and nonref flags, since it may be more
// appropriate to allocate those arrays earlier (during loading of a
// .bim-like file).
//
// pgfi.var_fpos is set to nullptr if pgfi.const_vrec_width is nonzero.
// pgfi.vrtypes/var_allele_cts are set to nullptr in the plink1-format case.
//
// raw_sample_ct and raw_variant_ct should be UINT32_MAX if not previously
// known.
//
// Intermission: Caller obtains a block of pgfi_alloc_cacheline_ct * 64 bytes,
// 64-byte aligned. The cachealigned_malloc() function can be used for this
// purpose. If necessary, pgfi.allele_idx_offsets and pgfi.nonref_flags
// should be pointed at already-loaded data, or allocated so they can be
// loaded during phase 2.
//
// Phase 2: Initialize most pointers in the PgenReader struct to appropriate
// positions in first_alloc. For modes 0x10-0x11, load pgfi.var_fpos and
// pgfi.vrtypes, load/validate pgfi.allele_idx_offsets and pgfi.nonref_flags
// if appropriate, and initialize pgfi.gflags, pgfi.max_allele_ct, and
// pgfi.max_dosage_allele_ct.
//
// Finally, if block-fread mode is being used, pgfi.block_base must be
// initialized to point to a memory large enough to handle the largest
// pgfi_block_read() operation that will be attempted.
// pgfi_blockload_get_cacheline_req() can be used to determine the necessary
// buffer size.
// This type may change if we introduce a more read-optimized format in the