-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataset.jl
133 lines (120 loc) · 3.59 KB
/
dataset.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
using DataStructures
"""
    parseSnapData(fname::String)

Parse the whole edge-list file `fname` without holding anything out.

Delegates to `trainTestSplitSnapData` with a split proportion of `1.0` and
returns only its first two results, `(PP, T)`: the per-node counts and the
per-node arrival times at the split point (which, for proportion `1.0`, is the
last line of the file).
"""
function parseSnapData(fname::String)
    parsed = trainTestSplitSnapData(fname, 1.0)
    # Only the "training" part is of interest; the remaining entries are dropped.
    return parsed[1], parsed[2]
end
"""
    trainTestSplitSnapData(fname::String, split_prop::Float64=0.8)

Read a whitespace-delimited edge list (one edge per line, the first two fields
being the start and terminal node) and split it into a training prefix and the
full sequence.

Each line `i` contributes two edge ends: the start node is observed at time
`2i - 1` and the terminal node at time `2i`. For every node, in order of first
appearance, the function records its arrival time (time of first appearance)
and its degree (number of edge ends seen so far).

# Arguments
- `fname`: path of the edge-list file.
- `split_prop`: proportion of lines that form the training prefix; the split
  line is `round(Int, N * split_prop)` where `N` is the number of lines.

# Returns
A 6-tuple `(PP, T, PP_test, T_test, n_train, n_test)`:
- `PP`, `T`: degrees and arrival times after the split line has been processed.
  Both are `nothing` when the split line is never reached (e.g. it is `0`).
- `PP_test`, `T_test`: degrees and arrival times after the whole file.
- `n_train`, `n_test`: number of edge ends (twice the number of lines) in the
  training prefix and in the remainder.

A line with fewer than two fields raises a `BoundsError`.
"""
function trainTestSplitSnapData(fname::String, split_prop::Float64=0.8)
    # `countlines` on a path opens and closes the file itself (no leaked handle).
    N = countlines(fname)
    split_n = round(Int, N * split_prop)
    n_test = N - split_n

    # node name -> position in the arrival-ordered vectors below
    node_slot = Dict{String,Int}()
    arrival_times = Int[]
    degrees = Int[]

    T = nothing
    PP = nothing

    io = open(fname)
    try
        for (i, ln) in enumerate(eachline(io))
            fields = split(ln)
            start = fields[1]
            terminal = fields[2]
            for (node, arrival) in ((start, 2 * i - 1), (terminal, 2 * i))
                slot = get(node_slot, node, 0)
                if slot == 0
                    # first time this node is seen: record arrival, degree 1
                    push!(arrival_times, arrival)
                    push!(degrees, 1)
                    node_slot[node] = length(degrees)
                else
                    degrees[slot] += 1
                end
            end
            if i == split_n
                # snapshot: later lines keep mutating `degrees`/`arrival_times`
                T = copy(arrival_times)
                PP = copy(degrees)
            end
        end
    finally
        close(io)
    end

    # T_test is a pure extension of T
    T_test = arrival_times
    # PP_test covers the same nodes as PP (plus later arrivals) with final degrees
    PP_test = degrees
    return PP, T, PP_test, T_test, 2 * split_n, 2 * n_test
end
"""
    generateInterarrivalTimes(TK::Char, N::Int, interarrival_dist::DiscreteDistribution)

Convenience method for i.i.d. interarrivals: the single distribution
`interarrival_dist` is wrapped in a two-argument function that ignores its
arguments and always returns that distribution, and the work is forwarded to
the `Function` method of `generateInterarrivalTimes`.
"""
function generateInterarrivalTimes(TK::Char, N::Int, interarrival_dist::DiscreteDistribution)
    # Same distribution regardless of the two arguments passed by the generator.
    return generateInterarrivalTimes(TK, N, (_t, _k) -> interarrival_dist)
end
"""
    generateInterarrivalTimes(TK::Char, N::Int, interarrival_dist::Function)

Generate a vector of integer arrival times whose first entry is `1`.

# Arguments
- `TK`: `'T'` if `N` is the total number of observations;
  `'K'` if `N` is the total number of arrivals.
- `N`: number of arrival times to generate (modulated by `TK`).
- `interarrival_dist`: function called as `interarrival_dist(t, k)` with the
  most recent arrival time `t` and its index `k`; the returned object is passed
  to `rand` to draw the next interarrival. `minimum` of
  `interarrival_dist(1, 1)` is also queried once: if it equals `0`, every
  interarrival is shifted up by one.

# Returns
- `TK == 'K'`: a vector of exactly `N` arrival times (empty when `N == 0`).
- `TK == 'T'`: arrival times are generated until one reaches or exceeds `N`;
  a final arrival time greater than `N` is dropped.

Throws an error if `TK` is neither `'T'` nor `'K'`.
"""
function generateInterarrivalTimes(TK::Char, N::Int, interarrival_dist::Function)
    # check function arguments
    TK != 'K' && TK != 'T' && error("`TK` must be 'T' or 'K'")

    # 1 when the reference distribution can produce a zero interarrival
    zero_shift = Int(minimum(interarrival_dist(1, 1)) == 0)

    if TK == 'K'
        T = zeros(Int64, N)
        # nothing to fill in; avoids indexing into an empty vector
        N == 0 && return T
        T[1] = 1 # first arrival time is always 1
        for j in 2:N
            T[j] = rand(interarrival_dist(T[j-1], j - 1)) + T[j-1] + zero_shift
        end
    else
        T = [1]
        while T[end] < N
            # arguments are evaluated before `push!` grows `T`
            push!(T, rand(interarrival_dist(T[end], length(T))) + T[end] + zero_shift)
        end
        # the last draw may have overshot the observation horizon
        if T[end] > N
            pop!(T)
        end
    end
    return T
end
"""
    generateLabelSequence(N::Int, alpha::Float64, interarrival_dist::DiscreteDistribution)

Convenience method for i.i.d. interarrivals: `interarrival_dist` is wrapped in
a two-argument function that ignores its arguments and always returns that
distribution, and the call is forwarded to the `Function` method of
`generateLabelSequence`.
"""
function generateLabelSequence(N::Int, alpha::Float64,
                               interarrival_dist::DiscreteDistribution)
    # Same distribution regardless of the two arguments passed by the generator.
    return generateLabelSequence(N, alpha, (_t, _k) -> interarrival_dist)
end
"""
    generateLabelSequence(N::Int, alpha::Float64, interarrival_dist::Function)

Generate a sequence of `N` cluster labels together with the resulting cluster
sizes and arrival times.

# Arguments
- `N`: number of observations in the sequence.
- `alpha`: BNTL α (the discount subtracted from each cluster size when
  sampling an existing label).
- `interarrival_dist`: function passed through to
  `generateInterarrivalTimes('T', N, interarrival_dist)` to generate the
  arrival times.

# Returns
A tuple `(Z, PP, T)`:
- `Z`: length-`N` vector of labels.
- `PP`: arrival-ordered partition counts (size of each cluster).
- `T`: arrival times; observation `T[k]` is the first one labelled `k`.
"""
function generateLabelSequence(N::Int, alpha::Float64,
                               interarrival_dist::Function)
    Z = zeros(Int, N) # sequence of labels
    T = generateInterarrivalTimes('T', N, interarrival_dist)
    K = length(T) # number of clusters
    PP = zeros(Int, K) # arrival-ordered partition counts
    k = 0 # number of clusters opened so far
    for n in 1:N
        if n <= T[end] && n == T[k+1]
            # observation n is an arrival: open cluster k + 1.
            # `k` cannot exceed `K` here because `T[k+1]` was just indexed.
            k += 1
            PP[k] = 1
            Z[n] = k
        else
            # discounted size-biased sample among the clusters opened so far
            Z[n] = wsample(1:k, view(PP, 1:k) .- alpha)
            PP[Z[n]] += 1
        end
    end
    return Z, PP, T
end
"""
    generatePsis(T::Vector{Int}, alpha::Float64)

Draw one `Psi` value per arrival time.

# Arguments
- `T`: arrival times.
- `alpha`: 'discount' parameter.

# Returns
A `Vector{Float64}` of the same length as `T`. The first entry is fixed at `1`;
for `j >= 2`, entry `j` is a draw from
`Beta(1 - alpha, T[j] - 1 - (j - 1) * alpha)`. An empty `T` yields an empty
vector.
"""
function generatePsis(T::Vector{Int}, alpha::Float64)
    K = length(T)
    Psi = zeros(Float64, K)
    # no arrivals: nothing to draw (and no first entry to set)
    K == 0 && return Psi
    Psi[1] = 1
    for j in 2:K
        Psi[j] = rand(Beta(1 - alpha, T[j] - 1 - (j - 1) * alpha))
    end
    return Psi
end