-
Notifications
You must be signed in to change notification settings - Fork 24
/
idat2gtc.c
4344 lines (3969 loc) · 191 KB
/
idat2gtc.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* The MIT License
Copyright (c) 2024 Giulio Genovese
Author: Giulio Genovese <[email protected]>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// the code in this file reimplements functionalities and ideas present in:
// - AutoConvert (v1.6.3.1)
// - GTCtoVCF
// - BeadArrayFiles
// these resources were provided by Illumina without license restrictions
// the code in this file can be used as a replacement of the Illumina AutoCall software to convert IDAT intensity files
// into GTC genotype files for Infinium arrays which was implemented over time in different proprietary software:
// - AutoConvert (v1.6.3.1) - http://support.illumina.com/downloads/beeline_software_v10.html
// - AutoConvert 2.0 (v2.0.1.179) - http://support.illumina.com/downloads/beeline-software-2-0.html
// - IAAP CLI (v1.1) - http://support.illumina.com/downloads/iaap-genotyping-cli.html
// - Array Analysis CLI (v2.1) -
// http://support.illumina.com/downloads/illumina-microarray-analytics-array-analysis-cli-v2-installers.html
// the Illumina AutoCall software performs three main steps:
// - Normalization
// - Genotyping
// - Gender Estimation
// if AutoConvert and AutoConvert 2.0 are run without an input cluster file, only the normalization will be performed
// the normalization, clustering, and genotype calling functionalities of Illumina AutoCall were covered by the
// following patents:
// - http://patents.google.com/patent/US7035740 - covers normalization algorithm (2024-05-05)
// - http://patents.google.com/patent/US7467117 - divisional, covers clustering and genotyping (2024-03-24)
// - http://patents.google.com/patent/US20050216207 - same as US7035740
// - http://patents.google.com/patent/US20060224529 - same as US7467117
// GenCall GenTrain 2.0 uses the following algorithms:
// - Normalization algorithm (version 1.1.2)
// - Clustering algorithm (version 6.3.1)
// - Genotyping algorithm (version 6.3.0)
// GenCall GenTrain 3.0 uses the following algorithms:
// - Normalization algorithm version 1.2.0
// - Clustering algorithm version 7.0.0
// - Genotyping algorithm version 7.0.0
// the Illumina GenCall Source Code (http://support.illumina.com/downloads/gencall_software.html) includes:
// - NormalizationGoldenGate.cs - normalization routines (version 1.1.0)
// - NormalizationInfinium.cs - normalization routines (version 1.1.2)
// - GenTrain60.cs - clustering (version 6.3.1) and genotyping (6.3.0) routines
// - Utils.cs - closest points to axis, MATLAB robust fit, and other MATLAB routines
// the InfiniumIDATParser Java implementation of the normalization algorithm (version 1.1.2) by Jay Carey includes:
// - InfiniumIDATParser.java - IDAT parsing routines (2010-02-25)
// - InfiniumNormalization.java - normalization routines (version 1.1.2) (2010-01-07)
// - InfiniumUtils.java - closest points to axis, MATLAB robust fit, and other MATLAB routines (2010-01-08)
// this software was used in the 1000 Genomes project (Supplementary chapter 5.3 of http://doi.org/10.1038/nature15394)
// as part of the intensity rank sum test (IRS test) in the Genome STRiP software
// the differences between the normalization algorithm version 1.1.2 and version 1.2.0 are:
// - the original implementation of the madsigma function for robust line fitting is updated as it was updated in MATLAB
// - HandleScale will not use loci with missing data anymore for sub-bead pool bins with less than 192 loci
// - NormalizeSingleBinSingleChannel handles Infinium I (A/T and C/G) probes for sub-bead pool bins with less than 192
//   loci, for which version 1.1.2 would previously not attempt to compute a background intensity offset
// each AutoCall software determines gender in a slightly different way:
// - AutoConvert (v1.6.3.1) - only uses X chromosome heterozygosity and checks whether it is higher than 0.1
// - AutoConvert 2.0 (v2.0.1.179) - checks whether Y chromosome intensity R values are higher than 0.3 if autosomal call
// rate is higher than 0.97
// - IAAP CLI (v1.1) - same as above but there is a bug in the determination of the autosomal call rate that includes
// loci with null cluster scores as missing
// - Array Analysis CLI (v2.1) - same as above but with the bug removed
// we follow the approach of AutoConvert 2.0 and Array Analysis CLI as default and allow the user to use the approach of
// AutoConvert if requested for inexplicable reasons
// AutoConvert 2.0, IAAP CLI, and Array Analysis CLI downsample to 10000 random autosomal loci to estimate the autosomal
// call rate; this behavior can be suppressed by setting the autosomal call rate threshold from 0.97 to 0.0. However,
// this cannot be done with Array Analysis CLI
// to replicate the functionality for interoperability purposes, the following bugs were reimplemented:
// - matlab_robustfit0 deviates from the original MATLAB implementation (statrobustfit) to match Illumina implementation
//   (robustLineFit) when input option addconst/calcoffset is false by erroneously summing the vector into a scalar and
//   causing the adjfactor variable to be always equal to 100.0
// - normalization IDs are allowed to overflow beyond 255, which happens with some probes in the Omni5 arrays, which can
//   cause some Infinium I (G/C) probes to be normalized together with some Infinium II probes
// - probe pairs with missing values are still used in the normalization step as probes with zero values
// - the additional code included in GenTrain 3.0 in the Illumina implementation (NormalizeSingleBinSingleChannel) calls
//   MATLAB function trimmean on an array where some values are artificially set to zero for no good reasons while
//   other values are left out
// - when determining scale_x with GenTrain 2.0 for normalization bins with less than 192 loci we include failed loci as
//   AA loci
/****************************************
* LITERATURE MENTIONING NORMALIZATION *
****************************************/
// http://doi.org/10.1101/sqb.2003.68.69
// Fan,J.B. et al. (2003) Highly parallel SNP genotyping. Cold Spring Harb Symp Quant Biol, 68, 69–78
// first document that mentions GenCall and GenTrain
// http://patents.google.com/patent/US7035740
// Kermani 2005, Artificial intelligence and global normalization methods for genotyping
// explains how normalization works
// http://patents.google.com/patent/US7467117
// Kermani 2006, Artificial intelligence and global normalization methods for genotyping
// also explains how normalization works(???)
// http://www.illumina.com/Documents/products/technotes/technote_gencall_data_analysis_software.pdf
// Illumina 2005, Illumina GenCall Data Analysis Software
// it does not describe the normalization but it refers to it
// http://doi.org/10.1016/j.mrfmmm.2004.07.022
// Shen 2005, High-throughput SNP genotyping on universal bead arrays
// introduces the GenTrain algorithm. It explains the GenScores are computed using fuzzy logic
// http://doi.org/10.1038/sj.ejhg.5201528
// Moorhead et al. 2006, Optimal genotype determination in highly multiplexed SNP data
// in the supplement a normalization procedure very similar to Illumina's is proposed
// http://dnatech.genomecenter.ucdavis.edu/wp-content/uploads/2013/06/illumina_gt_normalization.pdf
// http://dnatech.genomecenter.ucdavis.edu/documents/illumina_gt_normalization.pdf
// Illumina 2006, Illumina’s Genotyping Data Normalization Methods
// has color versions of the patent figures with details that are missing from the patent including the use of 400
// homozygotes
// http://doi.org/10.1101/gr.5402306
// Peiffer et al. 2006, High-resolution genomic profiling of chromosomal aberrations using Infinium whole-genome
// genotyping explains Illumina normalization with minimum details
// http://www.illumina.com/documents/products/technotes/technote_cnv_algorithms.pdf
// Illumina 2007, DNA Copy Number and Loss of Heterozygosity Analysis Algorithms
// explains how LRR and BAF behave over CNVs
// http://doi.org/10.1093/bioinformatics/btm443
// Teo et al. 2007, A genotype calling algorithm for the Illumina BeadArray platform
// explains Illumina normalization with details that are missing from the patent including the use of 400 homozygotes
// (paper about Illuminus caller)
// http://doi.org/10.1101/gr.5686107
// Oosting et al. 2007, High-resolution copy number analysis of paraffin-embedded archival tissue using SNP BeadArrays
// explains an alternative normalization strategy
// http://doi.org/10.1101/gr.6861907
// Wang et al. 2007, PennCNV: An integrated hidden Markov model designed for high-resolution copy number variation
// detection in whole-genome SNP genotyping data explains Illumina normalization with minimum details
// http://doi.org/10.1093/bioinformatics/btn386
// Giannoulatou et al. 2008 GenoSNP: a variational Bayes within-sample SNP genotyping algorithm that does not require a
// reference population explains an alternative normalization strategy still based on beadpools (paper about GenoSNP
// caller)
// http://doi.org/10.1186/1471-2105-9-409
// Staaf et al. 2008 Normalization of Illumina Infinium whole-genome SNP data improves copy number estimates and allelic
// intensity ratios explains Illumina normalization with minimum details
// http://www.illumina.com/documents/products/technotes/technote_gentrain2.pdf
// Illumina 2009, Improved Cluster Generation with Gentrain2
// explains Gentrain 2.0
// http://doi.org/10.1093/bioinformatics/btp470
// Ritchie et al. 2009 R/Bioconductor software for Illumina’s Infinium whole-genome genotyping BeadChips
// explains an alternative normalization strategy
// http://doi.org/10.1093/nar/gkp552
// LaFramboise et al. 2009 Single nucleotide polymorphism arrays: a decade of biological, computational and
// technological advances explains Illumina normalization with minimum details but defines it as "The computational
// workhorse in the Illumina protocol"
// http://support.illumina.com/documents/products/technotes/technote_array_analysis_workflows.pdf
// Illumina 2011, Microarray Data Analysis Workflows
// explains how IDAT are converted to GTC with AutoCall
// http://doi.org/10.1186/1471-2105-12-68
// Ritchie et al. 2011 Comparing genotyping algorithms for Illumina’s Infinium whole-genome SNP BeadChips
// explains Illumina normalization with minimum details (paper comparing GenCall GenTrain 1.0, Infinium, GenoSNP, CRLMM)
// http://doi.org/10.1007/978-1-61779-555-8_29
// Teo 2011 Genotype Calling for the Illumina Platform
// explains Illumina normalization with details that are missing from the patent including the use of 400 homozygotes
// http://doi.org/10.1093/bioinformatics/bts47
// Goldstein et al. 2012 zCall: a rare variant caller for array-based genotyping
// uses Illumina normalization but no details provided
// http://doi.org/10.1093/bioinformatics/btr673
// Li et al. 2012, M3 : an improved SNP calling algorithm for Illumina BeadArray data
// explains Illumina normalization with minimum details (paper about M3 caller)
// http://doi.org/10.1093/bioinformatics/bts180
// Shah et al. 2012, optiCall: a robust genotype-calling algorithm for rare, low-frequency and common variants
// explains Illumina normalization with minimum details (paper about optiCall caller which uses Illumina normalization)
// http://doi.org/10.1093/bioinformatics/btu107
// Zhou et al. 2014, iCall: a genotype-calling algorithm for rare, low-frequency and common variants on the Illumina
// exome array paper about iCall which uses Illumina normalization
// http://web.stat.tamu.edu/sheather/PDF/WZhou_MSProject.pdf
// Zhou 2014, Segmentation-Based Detection of Mosaic Chromosomal Abnormality in Bladder Cancer Cells Using Whole Genome
// SNP Array includes explanation of the normalization following Illumina's technical note
// http://doi.org/10.1111/pbi.12183
// Wang,S. et al. (2014) Characterization of polyploid wheat genomic diversity using a high-density 90,000 single
// nucleotide polymorphism array. Plant Biotechnol J, 12, 787–796. introduces the polyploid clustering algorithm
// released by Illumina on 2013-10-07
// http://emea.illumina.com/content/dam/illumina-marketing/documents/products/technotes/gentrain3-technical-note-370-2016-015.pdf
// Illumina 2016, Improved Genotype Clustering with GenTrain 3.0
// explains that with less than 192 loci in a single normalization bin it will perform an affine normalization with two
// degrees of freedom rather than six
// http://www.illumina.com/content/dam/illumina/gcs/assembled-assets/marketing-literature/gentrain-tech-note-m-gl-01258/gentrain-tech-note-m-gl-01258.pdf
// Illumina 2023, Genotype clustering with GenTrain 3.0
// explains that with less than 192 loci in a single normalization bin it will perform an affine normalization with two
// degrees of freedom rather than six
#include <ctype.h>
#include <getopt.h>
#include <errno.h>
#include <time.h>
#include <dirent.h>
#include <math.h>
#include <float.h>
#include <htslib/hts.h>
#include <htslib/hfile.h>
#include <htslib/khash.h>
#include <htslib/ksort.h>
#include <htslib/khash_str2int.h>
#include "bcftools.h"
// version string of this tool
#define IDAT2GTC_VERSION "2024-09-27"
// default date format string; presumably applied when emulating AutoCall output - TODO confirm against the users
// NOTE(review): in strftime %y is the two-digit year while the trailing comment says yyyy - confirm which is intended
#define AUTOCALL_DATE_FORMAT_DFLT "%m/%d/%y %#I:%M %p" // equivalent to "MM/dd/yyyy h:mm tt"
// default AutoCall version string
#define AUTOCALL_VERSION_DFLT "3.0.0"
// instantiate the generic ksort.h routines for float and int element types
KSORT_INIT_GENERIC(float)
KSORT_INIT_GENERIC(int)
// void error(const char *format, ...)
//{
// va_list ap;
// va_start(ap, format);
// vfprintf(stderr, format, ap);
// va_end(ap);
// exit(-1);
// }
//
// static inline int iupac2bitmask(char iupac)
//{
// const int A = 1;
// const int C = 2;
// const int G = 4;
// const int T = 8;
// if ( iupac >= 97 ) iupac -= 32;
// if ( iupac == 'A' ) return A;
// if ( iupac == 'C' ) return C;
// if ( iupac == 'G' ) return G;
// if ( iupac == 'T' ) return T;
// if ( iupac == 'M' ) return A|C;
// if ( iupac == 'R' ) return A|G;
// if ( iupac == 'W' ) return A|T;
// if ( iupac == 'S' ) return C|G;
// if ( iupac == 'Y' ) return C|T;
// if ( iupac == 'K' ) return G|T;
// if ( iupac == 'V' ) return A|C|G;
// if ( iupac == 'H' ) return A|C|T;
// if ( iupac == 'D' ) return A|G|T;
// if ( iupac == 'B' ) return C|G|T;
// if ( iupac == 'N' ) return A|C|G|T;
// return -1;
// }
//
///**
// * mkdir_p() - create new directory for a file $fname
// * @fname: the file name to create the directory for, the part after last "/" is ignored
// */
// void mkdir_p(const char *fmt, ...)
//{
// va_list ap;
// va_start(ap, fmt);
// int n = vsnprintf(NULL, 0, fmt, ap) + 2;
// va_end(ap);
//
// char *path = (char*)malloc(n);
// va_start(ap, fmt);
// vsnprintf(path, n, fmt, ap);
// va_end(ap);
//
// char *tmp = strdup(path), *p = tmp+1;
// while (*p)
// {
// while (*p && *p!='/') p++;
// if ( !*p ) break;
// char ctmp = *p;
// *p = 0;
// int ret = mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
// if ( ret!=0 && errno!=EEXIST ) error("Error creating directory %s: %s\n", path,strerror(errno));
// *p = ctmp;
// while ( *p && *p=='/' ) p++;
// }
// free(tmp);
// free(path);
//}
/****************************************
* hFILE READING FUNCTIONS *
****************************************/
// read up to nbytes from fp into buffer; when an MD5 context is supplied the bytes actually read are folded into it
// the return value is exactly what hread() returned and must be checked by the caller (HTS_RESULT_USED)
static inline ssize_t HTS_RESULT_USED md5_hread(hFILE *fp, void *buffer, size_t nbytes, hts_md5_context *md5) {
    const ssize_t n_read = hread(fp, buffer, nbytes);
    // nothing to hash when no context was given or when no byte was delivered
    if (!md5 || n_read <= 0) return n_read;
    hts_md5_update(md5, buffer, n_read);
    return n_read;
}
// read one character from fp; when an MD5 context is supplied and a character was read, fold it into the digest
// returns the value returned by hgetc() (the character, or EOF)
static inline int md5_hgetc(hFILE *fp, hts_md5_context *md5) {
    int c = hgetc(fp);
    if (md5 && c != EOF) {
        // hash the character through a one-byte object: taking the address of the int itself would hash its first
        // byte in memory, which is not the character on big-endian hosts
        unsigned char byte = (unsigned char)c;
        hts_md5_update(md5, &byte, 1);
    }
    return c;
}
// read or skip a fixed number of bytes
// - buffer != NULL: read exactly nbytes into buffer
// - buffer == NULL: consume nbytes from the stream one character at a time
// in both modes the consumed bytes are fed to md5 when it is not NULL
// a short read, an I/O error, or a premature end of stream terminates through error()
static void read_bytes(hFILE *hfile, void *buffer, size_t nbytes, hts_md5_context *md5) {
    if (buffer) {
        ssize_t ret = md5_hread(hfile, buffer, nbytes, md5);
        // test the sign first: comparing a negative ssize_t directly against a size_t converts it to a huge unsigned
        // value, which would let an I/O error go unnoticed
        if (ret < 0 || (size_t)ret < nbytes) error("Failed to read %zu bytes from stream\n", nbytes);
    } else {
        size_t i;
        for (i = 0; i < nbytes; i++) {
            if (md5_hgetc(hfile, md5) == EOF) error("Failed to reposition stream forward %zu bytes\n", nbytes);
        }
    }
}
// tests the end-of-file indicator for an hFILE
// probes the stream by reading one character; when a character was available the read position is stepped back by
// one so that the character is delivered again by the next read
// returns 1 when hgetc() reported EOF, 0 otherwise
static int heof(hFILE *hfile) {
    const int at_eof = (hgetc(hfile) == EOF);
    if (!at_eof) hfile->begin--;
    return at_eof;
}
// read or skip a fixed length array of nmemb elements of size bytes each
// - arr == NULL: skip nmemb * size bytes
// - arr != NULL and m_arr == NULL: allocate a new buffer of (nmemb + term) elements and store it in *arr
// - arr != NULL and m_arr != NULL: reuse *arr, growing it (and updating *m_arr) when *m_arr < nmemb + term
// term is the number of extra elements reserved after the data (e.g. room for a string terminator); they are
// allocated but never read from the stream
// consumed bytes are fed to md5 when it is not NULL; any failure terminates through error()
static void read_array(hFILE *hfile, void **arr, size_t *m_arr, size_t nmemb, size_t size, size_t term,
                       hts_md5_context *md5) {
    // element counts come from the input file: refuse values for which nmemb + term or the byte size would wrap
    // around, as that would pair a tiny allocation with a huge read
    const size_t n_alloc = nmemb + term;
    if (n_alloc < nmemb || (size && n_alloc > SIZE_MAX / size))
        error("Array of %zu elements of %zu bytes is too large to read from stream\n", nmemb, size);
    const size_t nbytes = nmemb * size;
    if (arr) {
        if (!m_arr) {
            *arr = malloc(n_alloc * size);
            if (!*arr) error("Failed to allocate memory for array\n");
        } else if (*m_arr < n_alloc) {
            void *tmp = realloc(*arr, n_alloc * size);
            if (!tmp) error("Failed to allocate memory for array\n");
            *arr = tmp;
            *m_arr = n_alloc;
        }
        ssize_t ret = md5_hread(hfile, *arr, nbytes, md5);
        // test the sign first: a negative ssize_t compared directly against a size_t would look like a huge count
        if (ret < 0 || (size_t)ret < nbytes) error("Failed to read %zu bytes from stream\n", nbytes);
    } else {
        size_t i;
        for (i = 0; i < nbytes; i++) {
            if (md5_hgetc(hfile, md5) == EOF) error("Failed to reposition stream forward %zu bytes\n", nbytes);
        }
    }
}
// read or skip a length-prefixed string
// the length is stored as an unsigned LEB128 integer (7 payload bits per byte, high bit set on all but the last byte)
// http://en.wikipedia.org/wiki/LEB128#Decode_unsigned_integer
// - str == NULL: the string bytes are skipped
// - str != NULL: the string is stored NUL-terminated in *str, through read_array() (see there for the role of m_str)
// when the length is zero and m_str is NULL nothing is read or allocated and *str is left untouched
// consumed bytes are fed to md5 when it is not NULL; any failure terminates through error()
static void read_pfx_string(hFILE *hfile, char **str, size_t *m_str, hts_md5_context *md5) {
    uint8_t byte;
    size_t n = 0, shift = 0;
    while (1) {
        if (md5_hread(hfile, (void *)&byte, 1, md5) < 1) {
            error("Failed to read 1 byte from stream\n");
        }
        // a malformed prefix with too many continuation bytes would shift by at least the width of size_t, which is
        // undefined behaviour
        if (shift >= sizeof(size_t) * 8) error("Length prefix of string in stream is too long\n");
        n |= (size_t)(byte & 0x7F) << shift;
        if (!(byte & 0x80)) break;
        shift += 7;
    }
    if (n || m_str) {
        // one extra element is reserved for the terminator written below
        read_array(hfile, (void **)str, m_str, n, 1, 1, md5);
        if (str) (*str)[n] = '\0';
    }
}
// check whether file is compressed with gzip
// peeks at the first two bytes of the stream without consuming them and compares them with 0x1f 0x8b
// returns non-zero on a match; terminates through error() when two bytes cannot be peeked
static int is_gzip(hFILE *hfile) {
    static const uint8_t gzip_magic[2] = {0x1f, 0x8b};
    uint8_t head[2];
    if (hpeek(hfile, (void *)head, 2) < 2) error("Failed to read 2 bytes from stream\n");
    return head[0] == gzip_magic[0] && head[1] == gzip_magic[1];
}
// write the in-memory bytes of a 16-bit unsigned integer to the stream; returns the result of hwrite()
static inline int hwrite_uint16(hFILE *hfile, uint16_t num) {
    const void *bytes = &num;
    return hwrite(hfile, bytes, sizeof(num));
}
// write the in-memory bytes of a 32-bit signed integer to the stream; returns the result of hwrite()
static inline int hwrite_int32(hFILE *hfile, int32_t num) {
    const void *bytes = &num;
    return hwrite(hfile, bytes, sizeof(num));
}
// write a length-prefixed string: the length as an unsigned LEB128 integer followed by the string bytes (no terminator)
// http://en.wikipedia.org/wiki/LEB128#Encode_unsigned_integer
// a NULL string is written as a single zero length byte
// returns the number of bytes written (prefix plus string), except that 0 is returned for a NULL string;
// returns -1 when writing to the stream fails
static int hwrite_pfx_string(hFILE *hfile, const char *str) {
    if (!str) {
        // report a failed write here as well instead of silently ignoring it; the return value of 0 on success is
        // kept unchanged for existing callers
        if (hputc(0, hfile) == EOF) return -1;
        return 0;
    }
    size_t n = strlen(str);
    size_t value = n;
    int ret = (int)n;
    do {
        // emit the low 7 bits and flag the byte with the high bit while more bits remain
        uint8_t byte = value & 0x7f;
        value >>= 7;
        if (value) byte ^= 0x80;
        if (hputc(byte, hfile) == EOF) return -1;
        ret++;
    } while (value);
    if (hwrite(hfile, str, n) < 0) return -1;
    return ret;
}
/****************************************
* IDAT FILE IMPLEMENTATION *
****************************************/
// http://github.com/snewhouse/glu-genetics/blob/master/glu/lib/illumina.py
// http://github.com/HenrikBengtsson/illuminaio/blob/master/R/readIDAT.R
// /humgen/cnp04/sandbox/bobh/idat_parser/src/edu/mit/broad/gapcore/apps/infinium_idat_parser/InfiniumIDATParser.java
// section codes of an IDAT file: idat_read() looks a code up in the table of contents, seeks to the matching offset
// and parses the section according to the code
// the trailing comment on each line gives an alternative name for the same code, presumably the one used by the
// reference parsers linked above - TODO confirm
#define NUM_SNPS_READ 1000 // ID_N_CORES
// #define ... 100 // ID_BACKGROUNDS - not used
// #define ... 101 // ID_BACKGROUND_DEVS - not used
#define ILLUMINA_ID 102 // ID_BEAD_TYPES
#define SD 103 // ID_DEVS
#define MEAN 104 // ID_MEANS
// #define ... 105 // ID_MEDIANS - not used
// #define ... 106 // ID_N_BEADS - not used
#define NBEADS 107 // ID_N_GOOD_BEADS
// #define ... 108 // ID_TRIMMED_MEANS - not used
#define MID_BLOCK 200 // ID_ILLUMICODES
#define RUN_INFO 300 // ID_PROCESS_HISTORY
#define RED_GREEN 400 // ID_TENTH_PERCENTILE
#define IDAT_SNP_MANIFEST 401 // ID_SAMPLE_BEADSET
#define SENTRIX_BARCODE 402 // ID_BARCODE
#define CHIP_TYPE 403 // ID_SENTRIX_FORMAT
#define SENTRIX_POSITION 404 // ID_SECTION_LABEL
#define BEADSET 405 // ID_BEADSET
#define IDAT_SAMPLE_NAME 406 // ID_DNA
#define DESCRIPTION 407 // ID_OPA
#define IDAT_SAMPLE_PLATE 408 // ID_DNA_PLATE
#define IDAT_SAMPLE_WELL 409 // ID_WELL
#define IDAT_SAMPLE_COUNT 410 // ID_SAMPLE_COUNT
// #define ... 411 // ID_DX - not used
#define IDAT_VLN 510 // ID_VLN
// one row of the chip_types table below
// the field order must not change: the table uses positional initializers
typedef struct {
// chip type string (idat_t has a field of the same name filled from the CHIP_TYPE section)
const char *chip_type;
// number of SNPs (idat_t.num_snps is filled from the NUM_SNPS_READ section)
int num_snps;
// number of mid blocks (idat_t.num_mid_blocks is filled from the MID_BLOCK section)
int num_mid_blocks;
// array name associated with the three values above; presumably the source of idat_t.chip_type_guess - TODO confirm
const char *chip_type_guess;
} chip_type_t;
// table of known arrays: {chip_type, num_snps, num_mid_blocks, chip_type_guess}
// the last row, with a NULL chip_type, is the end-of-table sentinel
// presumably scanned to guess the array name from the values found in an IDAT file - TODO confirm against the users
// names ending in " ???" are kept as found; they look like uncertain guesses (methylation arrays) - TODO confirm
static chip_type_t chip_types[] = {
{"1-95um_multi-swath_for_4x5M", 4568350, 4568350, "HumanOmni5-4-v1-0"},
{"1-95um_multi-swath_for_4x5M", 4640213, 4640213, "HumanOmni5-4v1-1"},
{"1-95um_multi-swath_for_4x5M", 4685673, 4685673, "InfiniumOmni5-4v1-2"},
{"1-95um_multi-swath_for_4x5M", 4696316, 4696316, "HumanOmni5-4-v1-0"},
{"1-95um_multi-swath_for_8x2-5M", 2266191, 2266191, "Multi-EthnicGlobal"},
{"1-95um_multi-swath_for_8x2-5M", 2266367, 2266367, "Multi-EthnicGlobal"},
{"1-95um_multi-swath_for_8x2-5M", 2266404, 2266404, "Multi-EthnicGlobal"},
{"1-95um_multi-swath_for_8x2-5M", 2266406, 2266406, "Multi-EthnicGlobal"},
{"1-95um_multi-swath_for_8x2-5M", 2268676, 2268676, "MEGAEx_BioVU_15075710"},
{"1-95um_multi-swath_for_8x2-5M", 2315574, 2315574, "Multi-EthnicGlobal"},
{"1-95um_multi-swath_for_8x2-5M", 2389000, 2389000, "CCPMBiobankMEGA2_20002558X345183"},
{"1-95um_multi-swath_for_8x2-5M", 2508689, 2508689, "GDA-8v1-0"},
{"1-95um_multi-swath_for_8x2-5M", 2550870, 2550870, "HumanOmni2.5-8v1"},
{"1-95um_multi-swath_for_8x2-5M", 2563064, 2563064, "HumanOmni25M-8v1-1"},
{"1-95um_multi-swath_for_8x2-5M", 2575219, 2575219, "HumanOmni2.5-8v1"},
{"1-95um_multi-swath_for_8x2-5M", 2605775, 2605775, "HumanOmni25M-8v1-1"},
{"BeadChip 12x1", 55300, 55300, "humanmethylation27_270596_v1-2 ???"},
{"BeadChip 12x1Q", 191668, 191668, "CanineHD"},
{"BeadChip 12x1Q", 299260, 299260, "HumanCytoSNP-12v2-1"},
{"BeadChip 12x8", 301084, 301084, "HumanCore-12v1-0"},
{"BeadChip 12x8", 304138, 304138, "HumanExome-12v1-1"},
{"BeadChip 12x8", 567727, 567727, "HumanCoreExome-12-v1-0"},
{"BeadChip 12x8", 569060, 569060, "HumanCoreExome-12-v1-0"},
{"BeadChip 12x8", 573012, 573012, "HumanCoreExome-12-v1-1"},
{"BeadChip 12x8", 576769, 576769, "HumanCoreExome-12-v1-1"},
{"BeadChip 12x8", 622399, 622399, "humanmethylation450_15017482_v-1-2 ???"},
{"BeadChip 12x8", 722405, 722405, "HumanOmniExpress-12-v1-1"},
{"BeadChip 12x8", 734889, 734889, "HumanOmniExpress-12-v1-0"},
{"BeadChip 12x8", 736136, 736136, "HumanOmniExpress-12-v1-0"},
{"BeadChip 1x12", 577085, 8627, "HumanHap550v3"},
{"BeadChip 1x12", 661182, 49163, "HumanHap650Yv3"},
{"BeadChip 1x40", 1129736, 57373, "Human1Mv1"},
{"BeadChip 1x40 66", 1078890, 52497, "Human1Mv1"},
{"BeadChip 24x1x4", 306776, 306776, "InfiniumCore-24v1-2"},
{"BeadChip 24x1x4", 527136, 527136, "OncoArray-500K"},
{"BeadChip 24x1x4", 577781, 577781, "HumanCoreExome-24v1-0"},
{"BeadChip 24x1x4", 581261, 581261, "HumanCoreExome-24v1-2"},
{"BeadChip 24x1x4", 582684, 582684, "HumanCoreExome-24v1-1"},
{"BeadChip 24x1x4", 611866, 611866, "HumanCoreExome-24v1-4"},
{"BeadChip 24x1x4", 623302, 623302, "PsychChip_15048346"},
{"BeadChip 24x1x4", 623513, 623513, "InfiniumPsychArray-24v1-1"},
{"BeadChip 24x1x4", 638714, 638714, "PsychChip_v1-1_15073391"},
{"BeadChip 24x1x4", 647864, 647864, "InfiniumPsychArray-24v1-3"},
{"BeadChip 24x1x4", 663209, 663209, "GSA-24v1-0"},
{"BeadChip 24x1x4", 704215, 704215, "GSA-24v3-0"},
{"BeadChip 24x1x4", 708013, 708013, "DeCodeGenetics_V1_20012591"},
{"BeadChip 24x1x4", 710576, 710576, "GSAMD-24v1-0_20011747"},
{"BeadChip 24x1x4", 710606, 710606, "GSAMD-24v1-0_20011747"},
{"BeadChip 24x1x4", 710608, 710608, "GSAMD-24v1-0_20011747"},
{"BeadChip 24x1x4", 715653, 715653, "HumanOmniExpress-24v1-1"},
{"BeadChip 24x1x4", 716279, 716279, "InfiniumOmniExpress-24v1-2"},
{"BeadChip 24x1x4", 718963, 718963, "HumanOmniExpress-24-v1-0"},
{"BeadChip 24x1x4", 719234, 719234, "HumanOmniExpress-24-v1-0"},
{"BeadChip 24x1x4", 729110, 729110, "ASA-24v1-0"},
{"BeadChip 24x1x4", 733354, 733354, "GSA-24v2-0"},
{"BeadChip 24x1x4", 749019, 749019, "DeCodeGenetics_V3_20032937X331991"},
{"BeadChip 24x1x4", 751614, 751614, "GSAMD-24v3-0-EA_20034606"},
{"BeadChip 24x1x4", 766804, 766804, "JSA-24v1-0"},
{"BeadChip 24x1x4", 776509, 776509, "ASA-24v1-0"},
{"BeadChip 24x1x4", 780343, 780343, "GSAMD-24v2-0_20024620"},
{"BeadChip 24x1x4", 780509, 780509, "GSAMD-24v2-0_20024620"},
{"BeadChip 24x1x4", 818205, 818205, "GSA-24v2-0"},
{"BeadChip 2x10", 321354, 37161, "HumanHap300v2"},
{"BeadChip 2x12", 381079, 29275, "HumanCNV370v1"},
{"BeadChip 2x20", 561686, 54936, "HumanHap550v3"},
{"BeadChip 2x6Q", 1224000, 180026, "Human1M-Duov3"},
{"BeadChip 2x6Q", 1224629, 180026, "Human1M-Duov3"},
{"BeadChip 48x4", 730546, 730546, "GSA-MD-48v4-0_20098041"},
{"BeadChip 4x10", 2623923, 1300482, "HumanOmni2.5-4v1"},
{"BeadChip 4x10", 2623923, 1323441, "HumanOmni2.5-4v1"},
{"BeadChip 4x10", 2624666, 1300941, "HumanOmni2.5-4v1"},
{"BeadChip 4x10", 2624666, 1323725, "HumanOmni2.5-4v1"},
{"BeadChip 4x10", 2624671, 1323726, "HumanOmni2.5-4v1"},
{"BeadChip 4x10", 2655594, 1354653, "HumanOmni2.5-4v1"},
{"BeadChip 4X1X14", 1186430, 1186430, "HumanOmni1-Quad_v1-0"},
{"BeadChip 4x2Q", 376216, 186490, "HumanCNV370-Quadv3"},
{"BeadChip 4x3Q", 626122, 208778, "Human610-Quadv1"},
{"BeadChip 4x3Q", 667447, 208778, "Human660W-Quad_v1"},
{"BeadChip 8x5", 1052641, 1052641, "infinium-methylationepic-v-1-0 ???"},
{"BeadChip 8x5", 867478, 867478, "CytoSNP-850K"},
{"BeadChip 8x5", 988240, 988240, "HumanOmniExpressExome-8-v1-1"},
{"BeadChip 8x5", 989536, 989536, "HumanOmniExpressExome-8-v1-1"},
{"BeadChip 8x5", 992824, 992824, "HumanOmniExpressExome-8-v1-4"},
{"BeadChip 8x5", 996003, 996003, "HumanOmniExpressExome-8-v1-2"},
{"BeadChip 8x5", 996055, 996055, "HumanOmniExpressExome-8-v1-2"},
{"SLIDE.15028542.24x1x3", 307984, 307984, "HumanCore-24v1-0"},
{"SLIDE.15028542.24x1x3", 311460, 311460, "HumanCore-24v1-0"},
{NULL, 0, 0, NULL}};
// one entry of the run information kept in idat_t.run_infos
// presumably one record of the RUN_INFO section (code 300, ID_PROCESS_HISTORY) - TODO confirm against the parser
// all fields are heap-allocated strings as far as the declarations show; their content is not visible from here
typedef struct {
char *run_time;
char *block_type;
char *block_pars;
char *block_code;
char *code_version;
} RunInfo;
// in-memory representation of an IDAT file; sections are loaded one at a time by idat_read()
typedef struct {
// file name, used in error messages
char *fn;
// stream the sections are read from
hFILE *hfile;
int64_t version;
// table of contents: number_toc_entries pairs of section code (id) and file offset (toc)
int32_t number_toc_entries;
uint16_t *id;
int64_t *toc;
// number of SNPs, read from the NUM_SNPS_READ section; length of ilmn_id, sd, mean and nbeads
int32_t num_snps;
// length of mid_block, read at the start of the MID_BLOCK section
int32_t num_mid_blocks;
// Illumina IDs, read from the ILLUMINA_ID section
int32_t *ilmn_id;
// per-SNP values read from the SD, MEAN and NBEADS sections
uint16_t *sd;
uint16_t *mean;
uint8_t *nbeads;
// alias of mean (set when the MEAN section is read), not a separate allocation
const uint16_t *trimmed_mean; // only used for historical purposes
// content of the MID_BLOCK section, one byte per mid block
uint8_t *mid_block;
// four bytes read from the RED_GREEN section
uint8_t red_green[4];
// length-prefixed strings, each read from the section of the matching name
char *snp_manifest;
char *sentrix_barcode;
char *chip_type;
char *sentrix_position;
char *beadset;
char *sample_name;
char *description;
char *sample_plate;
// presumably filled from the IDAT_SAMPLE_WELL, IDAT_SAMPLE_COUNT and IDAT_VLN sections - TODO confirm
char *sample_well;
int32_t sample_count;
char *vln;
// run information entries and their count; presumably filled from the RUN_INFO section - TODO confirm
RunInfo *run_infos;
int32_t m_run_infos;
// presumably derived values rather than sections of the file (no section code is defined for them) - TODO confirm
const char *chip_type_guess;
const char *imaging_date;
const char *scanner_data;
// khash_t(32) mapping each Illumina ID to its index in ilmn_id, built when the ILLUMINA_ID section is read
void *ilmn_id2index;
} idat_t;
// instantiate the khash map type named 32 (integer keys, int32_t values); used for idat_t.ilmn_id2index
KHASH_MAP_INIT_INT(32, int32_t)
static int idat_read(idat_t *idat, uint16_t id) {
int i;
for (i = 0; i < idat->number_toc_entries && id != idat->id[i]; i++);
if (i == idat->number_toc_entries) return -1;
if (hseek(idat->hfile, idat->toc[i], SEEK_SET) < 0)
error("Fail to seek to position %ld in IDAT %s file\n", idat->toc[i], idat->fn);
switch (id) {
case NUM_SNPS_READ:
read_bytes(idat->hfile, (void *)&idat->num_snps, sizeof(int32_t), NULL);
break;
case ILLUMINA_ID:
idat->ilmn_id = (int32_t *)malloc(idat->num_snps * sizeof(int32_t));
read_bytes(idat->hfile, (void *)idat->ilmn_id, idat->num_snps * sizeof(int32_t), NULL);
int ret;
idat->ilmn_id2index = kh_init(32);
khash_t(32) *hash = (khash_t(32) *)idat->ilmn_id2index;
for (i = 0; i < idat->num_snps; i++) {
khiter_t k = kh_put(32, hash, idat->ilmn_id[i], &ret);
if (ret < 0) error("Unable to insert Illumina ID %d in hash table\n", idat->ilmn_id[i]);
if (ret > 0)
kh_val(hash, k) = kh_size(hash) - 1;
else
error("Duplicate Illumina ID %d in hash table\n", idat->ilmn_id[i]);
}
break;
case SD:
idat->sd = (uint16_t *)malloc(idat->num_snps * sizeof(uint16_t));
read_bytes(idat->hfile, (void *)idat->sd, idat->num_snps * sizeof(uint16_t), NULL);
break;
case MEAN:
idat->mean = (uint16_t *)malloc(idat->num_snps * sizeof(uint16_t));
read_bytes(idat->hfile, (void *)idat->mean, idat->num_snps * sizeof(uint16_t), NULL);
idat->trimmed_mean = idat->mean;
break;
case NBEADS:
idat->nbeads = (uint8_t *)malloc(idat->num_snps * sizeof(uint8_t));
read_bytes(idat->hfile, (void *)idat->nbeads, idat->num_snps * sizeof(uint8_t), NULL);
break;
case MID_BLOCK:
read_bytes(idat->hfile, (void *)&idat->num_mid_blocks, sizeof(int32_t), NULL);
idat->mid_block = (uint8_t *)malloc(idat->num_mid_blocks * sizeof(uint8_t));
read_bytes(idat->hfile, (void *)idat->mid_block, idat->num_mid_blocks * sizeof(uint8_t), NULL);
break;
case RED_GREEN:
read_bytes(idat->hfile, (void *)&idat->red_green, 4 * sizeof(uint8_t), NULL);
break;
case IDAT_SNP_MANIFEST:
read_pfx_string(idat->hfile, &idat->snp_manifest, NULL, NULL);
break;
case SENTRIX_BARCODE:
read_pfx_string(idat->hfile, &idat->sentrix_barcode, NULL, NULL);
break;
case CHIP_TYPE:
read_pfx_string(idat->hfile, &idat->chip_type, NULL, NULL);
break;
case SENTRIX_POSITION:
read_pfx_string(idat->hfile, &idat->sentrix_position, NULL, NULL);
break;
case BEADSET:
read_pfx_string(idat->hfile, &idat->beadset, NULL, NULL);
break;
case IDAT_SAMPLE_NAME:
read_pfx_string(idat->hfile, &idat->sample_name, NULL, NULL);
break;
case DESCRIPTION:
read_pfx_string(idat->hfile, &idat->description, NULL, NULL);
break;
case IDAT_SAMPLE_PLATE:
read_pfx_string(idat->hfile, &idat->sample_plate, NULL, NULL);
break;
case IDAT_SAMPLE_WELL:
read_pfx_string(idat->hfile, &idat->sample_well, NULL, NULL);
break;
case IDAT_SAMPLE_COUNT:
read_bytes(idat->hfile, (void *)&idat->sample_count, sizeof(int32_t), NULL);
break;
case IDAT_VLN:
read_pfx_string(idat->hfile, &idat->vln, NULL, NULL);
break;
case RUN_INFO:
read_bytes(idat->hfile, (void *)&idat->m_run_infos, sizeof(int32_t), NULL);
idat->run_infos = (RunInfo *)malloc(idat->m_run_infos * sizeof(RunInfo));
for (i = 0; i < idat->m_run_infos; i++) {
read_pfx_string(idat->hfile, &idat->run_infos[i].run_time, NULL, NULL);
read_pfx_string(idat->hfile, &idat->run_infos[i].block_type, NULL, NULL);
read_pfx_string(idat->hfile, &idat->run_infos[i].block_pars, NULL, NULL);
read_pfx_string(idat->hfile, &idat->run_infos[i].block_code, NULL, NULL);
read_pfx_string(idat->hfile, &idat->run_infos[i].code_version, NULL, NULL);
}
break;
default:
error("IDAT file format does not support TOC entry %d\n", id);
break;
}
return 0;
}
static idat_t *idat_init(const char *fn, int load_arrays) {
idat_t *idat = (idat_t *)calloc(1, sizeof(idat_t));
idat->fn = strdup(fn);
idat->hfile = hopen(idat->fn, "rb");
if (idat->hfile == NULL) error("Could not open %s: %s\n", idat->fn, strerror(errno));
if (is_gzip(idat->hfile)) error("File %s is gzip compressed and currently cannot be sought\n", idat->fn);
int i;
uint8_t buffer[4];
if (hread(idat->hfile, (void *)buffer, 4) < 4) error("Failed to read magic number from %s file\n", idat->fn);
if (memcmp(buffer, "IDAT", 4) != 0) error("IDAT file %s format identifier is bad\n", idat->fn);
read_bytes(idat->hfile, (void *)&idat->version, sizeof(int64_t), NULL);
if (idat->version < 3)
error("Cannot read IDAT file %s. Unsupported IDAT file format version: %ld\n", idat->fn, idat->version);
read_bytes(idat->hfile, (void *)&idat->number_toc_entries, sizeof(int32_t), NULL);
idat->id = (uint16_t *)malloc(idat->number_toc_entries * sizeof(uint16_t));
idat->toc = (int64_t *)malloc(idat->number_toc_entries * sizeof(int64_t));
for (i = 0; i < idat->number_toc_entries; i++) {
read_bytes(idat->hfile, (void *)&idat->id[i], sizeof(uint16_t), NULL);
read_bytes(idat->hfile, (void *)&idat->toc[i], sizeof(int64_t), NULL);
}
for (i = 0; i < idat->number_toc_entries; i++) {
if (!load_arrays && idat->id[i] <= MID_BLOCK) {
if (idat->id[i] == MID_BLOCK) {
if (hseek(idat->hfile, idat->toc[i], SEEK_SET) < 0)
error("Fail to seek to position %ld in IDAT %s file\n", idat->toc[i], idat->fn);
read_bytes(idat->hfile, (void *)&idat->num_mid_blocks, sizeof(int32_t), NULL);
}
continue;
}
idat_read(idat, idat->id[i]);
}
if (idat->chip_type) {
const chip_type_t *ptr;
for (ptr = chip_types; ptr->chip_type; ptr++) {
if (strcmp(idat->chip_type, ptr->chip_type) == 0 && ptr->num_snps == idat->num_snps
&& ptr->num_mid_blocks == idat->num_mid_blocks)
idat->chip_type_guess = ptr->chip_type_guess;
}
}
for (i = 0; i < idat->m_run_infos; i++) {
if (strcmp(idat->run_infos[i].block_type, "Scan") != 0) continue;
idat->imaging_date = idat->run_infos[i].run_time;
idat->scanner_data = idat->run_infos[i].block_pars;
}
return idat;
}
// Closes the file behind `idat` and releases every allocation the structure owns, then the
// structure itself. A NULL `idat` is accepted and ignored. A failing close is reported through
// error(). chip_type_guess, imaging_date, scanner_data and trimmed_mean are not freed here: the
// code that sets them only stores pointers to memory owned elsewhere.
static void idat_destroy(idat_t *idat) {
    int32_t k;
    size_t j;

    if (idat == NULL) return;
    if (hclose(idat->hfile) < 0) error("Error closing IDAT file %s\n", idat->fn);

    // each run-info record owns its five strings
    for (k = 0; k < idat->m_run_infos; k++) {
        RunInfo *info = &idat->run_infos[k];
        free(info->run_time);
        free(info->block_type);
        free(info->block_pars);
        free(info->block_code);
        free(info->code_version);
    }

    if (idat->ilmn_id2index) kh_destroy(32, idat->ilmn_id2index);

    // everything else is a plain heap block; free(NULL) is a no-op so unset fields are fine
    void *owned[] = {idat->fn,          idat->id,          idat->toc,
                     idat->snp_manifest, idat->sentrix_barcode, idat->chip_type,
                     idat->sentrix_position, idat->beadset,  idat->sample_name,
                     idat->description, idat->sample_plate, idat->sample_well,
                     idat->vln,         idat->run_infos,   idat->ilmn_id,
                     idat->sd,          idat->mean,        idat->nbeads,
                     idat->mid_block};
    for (j = 0; j < sizeof(owned) / sizeof(owned[0]); j++) free(owned[j]);

    free(idat);
}
// Writes a CSV-style report of one IDAT file to `stream`: a [Heading] section with the scalar
// metadata, then [Assay], [Mid Blocks] and [Run Infos].
//
// idat     structure produced by idat_init()
// stream   destination, already open for writing
// verbose  when non-zero the per-probe arrays and the mid blocks are listed in full; this
//          dereferences ilmn_id, sd, mean, nbeads and mid_block, so they must have been loaded.
//          NOTE(review): idat_init() skips them when its load_arrays argument is zero - confirm
//          callers keep the two flags in step
//
// String fields are NULL when their section is missing from the file; all of them are printed as
// empty in that case, since handing a NULL pointer to "%s" is undefined behavior.
static void idat_to_csv(const idat_t *idat, FILE *stream, int verbose) {
    int i;
    // file name without its directory part
    const char *slash = strrchr(idat->fn, '/');
    const char *base_name = slash ? slash + 1 : idat->fn;

    fprintf(stream, "Illumina, Inc.\n");
    fprintf(stream, "[Heading]\n");
    fprintf(stream, "Descriptor File Name,%s\n", base_name);
    // int64_t is not `long` on every platform, so print through long long
    fprintf(stream, "IDAT file version,%lld\n", (long long)idat->version);
    fprintf(stream, "Number of TOC entries,%d\n", idat->number_toc_entries);
    fprintf(stream, "Probes Count,%d\n", idat->num_snps);
    fprintf(stream, "Mid Blocks Count,%d\n", idat->num_mid_blocks);
    fprintf(stream, "Red Green,%02x %02x %02x %02x\n", idat->red_green[0], idat->red_green[1], idat->red_green[2],
            idat->red_green[3]);
    fprintf(stream, "SNP Manifest,%s\n", idat->snp_manifest ? idat->snp_manifest : "");
    fprintf(stream, "Sentrix Barcode,%s\n", idat->sentrix_barcode ? idat->sentrix_barcode : "");
    fprintf(stream, "Chip Type,%s\n", idat->chip_type ? idat->chip_type : "");
    fprintf(stream, "Sentrix Position,%s\n", idat->sentrix_position ? idat->sentrix_position : "");
    fprintf(stream, "BeadSet,%s\n", idat->beadset ? idat->beadset : "");
    fprintf(stream, "Sample Name,%s\n", idat->sample_name ? idat->sample_name : "");
    fprintf(stream, "Description,%s\n", idat->description ? idat->description : "");
    fprintf(stream, "Sample Plate,%s\n", idat->sample_plate ? idat->sample_plate : "");
    fprintf(stream, "Sample Well,%s\n", idat->sample_well ? idat->sample_well : "");
    fprintf(stream, "Sample Count,%d\n", idat->sample_count);
    fprintf(stream, "Vln,%s\n", idat->vln ? idat->vln : "");
    fprintf(stream, "Chip Prefix (Guess),%s\n", idat->chip_type_guess ? idat->chip_type_guess : "Unknown");

    fprintf(stream, "[Assay]\n");
    fprintf(stream, "IlmnID,Sd,Mean,Nbeads\n");
    if (verbose) {
        for (i = 0; i < idat->num_snps; i++)
            fprintf(stream, "%d,%d,%d,%d\n", idat->ilmn_id[i], idat->sd[i], idat->mean[i], idat->nbeads[i]);
        fprintf(stream, "[Mid Blocks]\n");
        for (i = 0; i < idat->num_mid_blocks; i++) fprintf(stream, "%d\n", idat->mid_block[i]);
    } else {
        fprintf(stream, "... use --verbose to visualize Assay data ...\n");
        fprintf(stream, "[Mid Blocks]\n");
        fprintf(stream, "... use --verbose to visualize Mid Blocks data ...\n");
    }

    fprintf(stream, "[Run Infos]\n");
    for (i = 0; i < idat->m_run_infos; i++) {
        const RunInfo *info = &idat->run_infos[i];
        fprintf(stream, "%s\t%s\t%s\t%s\t%s\n", info->run_time ? info->run_time : "",
                info->block_type ? info->block_type : "", info->block_pars ? info->block_pars : "",
                info->block_code ? info->block_code : "", info->code_version ? info->code_version : "");
    }
}
// Writes a tab-separated summary of `n` IDAT files to `stream`: one header line followed by one
// line per file, in the order given.
//
// idats   array of `n` structures produced by idat_init()
// n       number of entries in `idats`
// stream  destination, already open for writing
//
// String fields are NULL when their section is missing from the file; all of them are printed as
// empty in that case (chip_type_guess as "Unknown"), since handing a NULL pointer to "%s" is
// undefined behavior.
static void idats_to_tsv(idat_t **idats, int n, FILE *stream) {
    fprintf(stream,
            "idat\tnumber_probes\tnumber_mid_blocks\tred_green\tmanifest_file\tsentrix_"
            "barcode\tchip_type\t"
            "sentrix_position\tbeadset\tsample_name\tdescription\tsample_plate\tsample_"
            "well\tsample_count\tvln\t"
            "chip_type_guess\tscan_date\tscanner_data\n");
    int i;
    for (i = 0; i < n; i++) {
        const idat_t *idat = idats[i];
        // file name without its directory part
        const char *slash = strrchr(idat->fn, '/');
        const char *base_name = slash ? slash + 1 : idat->fn;
        fprintf(stream,
                "%s\t%d\t%d\t%02x %02x %02x "
                "%02x\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\t%s\n",
                base_name, idat->num_snps, idat->num_mid_blocks, idat->red_green[0], idat->red_green[1],
                idat->red_green[2], idat->red_green[3], idat->snp_manifest ? idat->snp_manifest : "",
                idat->sentrix_barcode ? idat->sentrix_barcode : "", idat->chip_type ? idat->chip_type : "",
                idat->sentrix_position ? idat->sentrix_position : "", idat->beadset ? idat->beadset : "",
                idat->sample_name ? idat->sample_name : "", idat->description ? idat->description : "",
                idat->sample_plate ? idat->sample_plate : "", idat->sample_well ? idat->sample_well : "",
                idat->sample_count, idat->vln ? idat->vln : "",
                idat->chip_type_guess ? idat->chip_type_guess : "Unknown",
                idat->imaging_date ? idat->imaging_date : "", idat->scanner_data ? idat->scanner_data : "");
    }
}
/****************************************
* GTC FILE IMPLEMENTATION *
****************************************/
// http://github.com/broadinstitute/picard/blob/master/src/main/java/picard/arrays/illumina/InfiniumGTCFile.java
// http://github.com/Illumina/BeadArrayFiles/blob/develop/docs/GTC_File_Format_v5.pdf
// http://github.com/Illumina/BeadArrayFiles/blob/develop/module/GenotypeCalls.py
// Numeric identifiers of the GTC file sections; most names mirror a field of gtc_t below.
// NOTE(review): presumably these are matched against gtc_t.id[] the same way idat_read() matches
// IDAT section ids - the GTC reader is outside this part of the file, confirm there.
// Entries tagged "AutoConvert 2.0" carry that tag from the original author.
#define NUM_SNPS 1
#define PLOIDY 2 // AutoConvert 2.0
#define PLOIDY_TYPE 3 // AutoConvert 2.0
#define GTC_SAMPLE_NAME 10
#define GTC_SAMPLE_PLATE 11
#define GTC_SAMPLE_WELL 12
#define CLUSTER_FILE 100
#define GTC_SNP_MANIFEST 101
#define IMAGING_DATE 200
#define AUTOCALL_DATE 201
#define AUTOCALL_VERSION 300
#define NORMALIZATION_TRANSFORMS 400
#define CONTROLS_X 500
#define CONTROLS_Y 501
#define RAW_X 1000
#define RAW_Y 1001
#define GENOTYPES 1002
#define BASE_CALLS 1003
#define GENOTYPE_SCORES 1004
#define SCANNER_DATA 1005
#define CALL_RATE 1006
#define GENDER 1007
#define LOGR_DEV 1008
#define GC10 1009
#define DX 1010
#define SAMPLE_DATA 1011
#define B_ALLELE_FREQS 1012 // AutoConvert 2.0
#define LOGR_RATIOS 1013 // AutoConvert 2.0
#define PERCENTILES_X 1014 // AutoConvert 2.0
#define PERCENTILES_Y 1015 // AutoConvert 2.0
#define SLIDE_IDENTIFIER 1016 // AutoConvert 2.0
// static const char *code2genotype[] = {
// "NC", "AA", "AB", "BB", "NULL", "A", "B", "AAA",
// "AAB", "ABB", "BBB", "AAAA", "AAAB", "AABB", "ABBB", "BBBB",
// "AAAAA", "AAAAB", "AAABB", "AABBB", "ABBBB", "BBBBB", "AAAAAA", "AAAAAB",
// "AAAABB", "AAABBB", "AABBBB", "ABBBBB", "BBBBBB", "AAAAAAA", "AAAAAAB", "AAAAABB",
// "AAAABBB", "AAABBBB", "AABBBBB", "ABBBBBB", "BBBBBBB", "AAAAAAAA", "AAAAAAAB", "AAAAAABB",
// "AAAAABBB", "AAAABBBB", "AAABBBBB", "AABBBBBB", "ABBBBBBB", "BBBBBBBB"};
// One normalization transform record; gtc_t keeps an array of these in normalization_transforms.
// NOTE(review): the code that reads and applies these values is outside this part of the file.
// The names suggest a per-axis offset and scale plus shear and a rotation angle (theta) - confirm
// the exact meaning of every field against the GTC format documentation linked above.
typedef struct {
int32_t version;
float offset_x;
float offset_y;
float scale_x;
float scale_y;
float shear;
float theta;
float cvx;
float cvy;
float nn12;
float rr12;
float taa;
float tbb;
} XForm;
// exactly two characters per call, with no room for a terminating NUL; gtc_t.base_calls is an
// array of these
typedef char BaseCall[2];
// Scanner description embedded by value in gtc_t.scanner_data.
// NOTE(review): presumably filled from the SCANNER_DATA section; whether the three strings are
// owned (and must be freed) is decided by code outside this part of the file - confirm there.
typedef struct {
char *scanner_name;
int32_t pmt_green;
int32_t pmt_red;
char *scanner_version;
char *imaging_user;
} ScannerData;
// Per-sample summary embedded by value in gtc_t.sample_data.
// NOTE(review): presumably filled from the SAMPLE_DATA section - the reader is outside this part
// of the file, confirm there.
typedef struct {
float p50gc;
int32_t num_calls;
int32_t num_no_calls;
int32_t num_intensity_only;
} SampleData;
// three 16-bit values; used for gtc_t.percentiles_x and gtc_t.percentiles_y
// NOTE(review): which three percentiles are stored is not visible in this part of the file
typedef uint16_t Percentiles[3];
// In-memory view of one GTC file.
// NOTE(review): the code that fills and frees this structure is outside this part of the file;
// the notes below are drawn from the declarations and from the section identifiers defined above,
// not from the reader itself.
typedef struct {
// file name and open handle
char *fn;
hFILE *hfile;
int32_t version;
// table of contents; note that toc holds int32_t values here whereas idat_t.toc is int64_t
int32_t number_toc_entries;
uint16_t *id;
int32_t *toc;
// scalar sections
int32_t num_snps;
int32_t ploidy;
int32_t ploidy_type;
// string sections
char *sample_name;
char *sample_plate;
char *sample_well;
char *cluster_file;
char *snp_manifest;
char *imaging_date;
char *autocall_date;
char *autocall_version;
// arrays paired with a size_t m_<name> companion
// NOTE(review): presumably the element count or allocated capacity of the array - confirm
XForm *normalization_transforms;
size_t m_normalization_transforms;
uint16_t *controls_x;
size_t m_controls_x;
uint16_t *controls_y;
size_t m_controls_y;
// summary values stored by value
ScannerData scanner_data;
float call_rate;
char gender;
float logr_dev;
float p10gc;
int32_t dx;
SampleData sample_data;
Percentiles percentiles_x;
Percentiles percentiles_y;
char *sentrix_id;
char *display_name;
float *sin_theta; // precomputed sine transforms
float *cos_theta; // precomputed cosine transforms
// per-probe arrays, each paired with a size_t m_<name> companion (see note above)
uint16_t *raw_x;
size_t m_raw_x;
uint16_t *raw_y;
size_t m_raw_y;
uint8_t *genotypes;
size_t m_genotypes;
BaseCall *base_calls;
size_t m_base_calls;
float *genotype_scores;
size_t m_genotype_scores;
float *b_allele_freqs;
size_t m_b_allele_freqs;
float *logr_ratios;
size_t m_logr_ratios;
} gtc_t;
// Size in bytes of `s` once stored as a length-prefixed string: the characters of `s` plus the
// variable-length prefix that encodes their number, seven bits of the length per prefix byte.
// A NULL `s` counts as one byte, the same as an empty string (a lone prefix byte).
static int leb128_strlen(const char *s) {
    size_t length, remaining, prefix_bytes;

    if (s == NULL) return 1;

    length = strlen(s);
    // the first prefix byte is always present; each further group of seven bits adds one more
    prefix_bytes = 1;
    remaining = length >> 7;
    while (remaining != 0) {
        prefix_bytes++;
        remaining >>= 7;
    }
    return length + prefix_bytes;
}