-
Notifications
You must be signed in to change notification settings - Fork 0
/
meth_hdf5_part1.jl
153 lines (130 loc) · 3.78 KB
/
meth_hdf5_part1.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# This file takes an input directory (IN_DIR) with output file from
# Bismarck (one for each cell) and produces intermediate files in an
# output directory (OUT_DIR), which can then be consolidated into a
# HDF5 file by the "part2" script.
const IN_DIR = "eb/" # do not omit trailing slash
const OUT_DIR = "bstmp/" # ditto; must exist and should be empty
import GZip
import CSV
const eof_marker = "zzzz"
# This eof_marker should sort later than all chromosome names.
struct GenomicPosition
chrom ::String
pos ::UInt32
end
function Base.isless( gp1 ::GenomicPosition, gp2 ::GenomicPosition )
if gp1.chrom == gp2.chrom
return gp1.pos < gp2.pos
else
return gp1.chrom < gp2.chrom
end
end
function Base.show( io ::IO, gp ::GenomicPosition )
print( io, gp.chrom, ":", gp.pos )
end
struct BismarckRecord
gp ::GenomicPosition
count_unmeth ::UInt8
count_meth ::UInt8
end
struct BismarckInput
io ::IO
_prev_chrom ::Base.RefValue{String}
function BismarckInput( io ::IO )
firstline = readline( io )
if firstline != "chr\tpos\tmet_reads\tnonmet_reads\trate\n"
error( "Wrong file formar" )
end
return new( io, Ref("") )
end
end
function next( b ::BismarckInput )
if eof(b.io)
return BismarckRecord( GenomicPosition(eof_marker, 0), 0, 0 )
end
line = readline( b.io )
if line == "" || line == "\n"
return BismarckRecord( GenomicPosition(eof_marker, 0), 0, 0 )
end
fields = split( line, "\t" )
if length(fields) != 5
println( ">",line,"<")
error( "Input line does not contain 5 fields." )
end
gp = GenomicPosition( fields[1], parse( Int, fields[2] ) )
br = BismarckRecord( gp, parse( UInt8, fields[4] ), parse( UInt8, fields[3] ) )
if gp.chrom != b._prev_chrom[]
# New chrosome starts, do a few checks
if gp.chrom > eof_marker
error( "Chromosome name sorts after EOF marker." )
end
if gp.chrom < b._prev_chrom[]
error( "Chromosomes are not lexically sorted in input file." )
end
b._prev_chrom[] = gp.chrom
end
return br
end
function main()
# Open input files
input_filenames = readdir( IN_DIR )
input_files = [ GZip.open( IN_DIR*fn ) for fn in input_filenames ]
inputs = [ BismarckInput(f) for f in input_files ]
# Write cell list
cell_list = open( OUT_DIR*"cells.lst", "w" )
for fn in input_filenames
println( cell_list, fn )
end
close( cell_list )
# Open other output files
values_out = open( OUT_DIR*"values.bin", "w" )
index_out = nothing # will be opened later
cur_idx::UInt32 = 0
chrom_list_out = open( OUT_DIR*"chromosomes.csv", "w" )
currec = [ next(bi) for bi in inputs ]
prev_chrom = ""
while true
curpos = minimum( rc.gp for rc in currec )
if curpos.chrom == eof_marker
break
end
if curpos.chrom != prev_chrom
# We start a new chromosome
if index_out != nothing
close( index_out )
println( chrom_list_out, prev_chrom, ",", cur_idx )
println( "Finished processing chromosome ", prev_chrom)
end
prev_chrom = curpos.chrom
index_out = open( OUT_DIR*curpos.chrom*".idx", "w" )
end
#println( curpos )
write( index_out, UInt32(curpos.pos), UInt32(cur_idx) )
for i in 1:length(inputs)
if currec[i].gp == curpos
#println( " cell ", i, ": ", currec[i].count_unmeth, "+", currec[i].count_meth )
currec[i] = next( inputs[i] )
write( values_out, UInt16(i), UInt8(currec[i].count_unmeth), UInt8(currec[i].count_meth) )
cur_idx += 1
end
end
end
for f in input_files
close(f)
end
close( index_out )
close( values_out )
println( chrom_list_out, prev_chrom, ",", cur_idx )
println( "Finished processing chromosome ", prev_chrom )
close( chrom_list_out )
end
main()
# Description of output format:
#
# values.bin
# each record has 4 Bytes, as follows:
# Bytes 0,1: cell number, zero-based, UInt16
# Byte 2: count unmethylated reads
# Byte 3: count methylated reads
#
#