-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_datasets
executable file
·112 lines (84 loc) · 2.86 KB
/
get_datasets
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env fish
# Download benchmark datasets into the "datasets" directory, relative to the
# directory the script is run from. Data that is already present is skipped.

set root "datasets"

### sparse matrices from SuiteSparse: https://sparse.tamu.edu/ML_Graph
set sparse_root "$root/suitesparse"
# create folders
mkdir -p $sparse_root

set urls \
    "http://sparse-files.engr.tamu.edu/MM/ML_Graph/JapaneseVowelsSmall_10NN.tar.gz" \
    "http://sparse-files.engr.tamu.edu/MM/ML_Graph/yeast_30NN.tar.gz" \
    "http://sparse-files.engr.tamu.edu/MM/ML_Graph/Plants_10NN.tar.gz" \
    "http://sparse-files.engr.tamu.edu/MM/ML_Graph/mice_10NN.tar.gz" \
    "http://sparse-files.engr.tamu.edu/MM/ML_Graph/iris_dataset_30NN.tar.gz"

for url in $urls
    # archive file name: last path component of the URL
    set fname (string split "/" -- $url)[-1]
    # dataset name: archive file name up to its first "."
    set name (string split "." -- $fname)[1]
    # NOTE(review): assumes each archive unpacks into a directory named
    # after the dataset; the skip check below relies on that -- confirm.
    set path "$sparse_root/$name"
    # if not downloaded, download and extract
    if test ! -d $path
        # --fail turns HTTP errors into a non-zero exit status instead of
        # saving the error page; --location follows redirects. Only extract
        # when the download actually succeeded.
        if curl --fail --location $url --output $fname
            tar -xvf $fname -C "$sparse_root"
        else
            echo "get_datasets: failed to download $url" >&2
        end
        # -f: the archive may not exist if curl failed before writing anything
        rm -f $fname
    end
end
### datasets from the book "Gaussian Processes for Machine Learning":
# https://gaussianprocess.org/gpml/data/
set gpml_root "$root/gpml"
mkdir -p $gpml_root

set urls \
    "https://gaussianprocess.org/gpml/data/sarcos_inv.mat" \
    "https://gaussianprocess.org/gpml/data/sarcos_inv_test.mat"

for url in $urls
    # file name: last path component of the URL
    set fname (string split "/" -- $url)[-1]
    set path "$gpml_root/$fname"
    # if not downloaded, download
    if test ! -e $path
        # Fetch into a ".part" file and rename only on success. Otherwise an
        # HTTP error page or an interrupted transfer would be left at $path,
        # and the existence check above would treat it as a finished download
        # on every later run.
        if curl --fail --location $url --output "$path.part"
            mv "$path.part" "$path"
        else
            echo "get_datasets: failed to download $url" >&2
            rm -f "$path.part"
        end
    end
end
### datasets from LIBSVM
# https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
set libsvm_root "$root/libsvm/cod-rna"
mkdir -p $libsvm_root

set urls \
    "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/cod-rna" \
    "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/cod-rna.t" \
    "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/cod-rna.r"

for url in $urls
    # file name: last path component of the URL
    set fname (string split "/" -- $url)[-1]
    set path "$libsvm_root/$fname"
    # if not downloaded, download
    if test ! -e $path
        # Download to a ".part" file first and move it into place only when
        # curl succeeds, so a failed or partial transfer never satisfies the
        # existence check above on a later run. --fail makes HTTP errors a
        # non-zero exit; --location follows redirects.
        if curl --fail --location $url --output "$path.part"
            mv "$path.part" "$path"
        else
            echo "get_datasets: failed to download $url" >&2
            rm -f "$path.part"
        end
    end
end
### datasets from the UCI machine learning repository:
# https://archive.ics.uci.edu/ml/datasets.php
set uci_root "$root/uci"
mkdir -p $uci_root
# base URL shared by the UCI downloads
set uci_url "https://archive.ics.uci.edu/ml/machine-learning-databases"

## Statlog (Shuttle) dataset
set shuttle_root "$uci_root/shuttle"
mkdir -p $shuttle_root
set url1 "$uci_url/statlog/shuttle/shuttle.trn.Z"
set url2 "$uci_url/statlog/shuttle/shuttle.tst"
set path1 "$shuttle_root/shuttle.trn"
set path1_compressed "$shuttle_root/shuttle.trn.Z"
set path2 "$shuttle_root/shuttle.tst"

# training set: distributed compressed (.Z), unpacked after download
if test ! -e $path1
    # --fail: an HTTP error becomes a non-zero exit instead of a saved error
    # page; --location: follow redirects. Decompress only a successful download.
    if curl --fail --location $url1 --output $path1_compressed
        # gzip -d strips the ".Z" suffix, producing $path1
        gzip -d $path1_compressed
    else
        echo "get_datasets: failed to download $url1" >&2
        rm -f $path1_compressed
    end
end

# test set: distributed uncompressed
if test ! -e $path2
    # Write to a ".part" file and rename on success so a failed or partial
    # transfer is not mistaken for the finished file on the next run.
    if curl --fail --location $url2 --output "$path2.part"
        mv "$path2.part" "$path2"
    else
        echo "get_datasets: failed to download $url2" >&2
        rm -f "$path2.part"
    end
end
## SUSY dataset
set susy_root "$uci_root/susy"
mkdir -p $susy_root
set url1 "$uci_url/00279/SUSY.csv.gz"
set path1 "$susy_root/SUSY.csv"
set path1_compressed "$susy_root/SUSY.csv.gz"

# skip when the decompressed CSV is already present
if test ! -e $path1
    # --fail: an HTTP error becomes a non-zero exit instead of a saved error
    # page; --location: follow redirects. Decompress only a successful download.
    if curl --fail --location $url1 --output $path1_compressed
        # gzip -d strips the ".gz" suffix, producing $path1
        gzip -d $path1_compressed
    else
        echo "get_datasets: failed to download $url1" >&2
        rm -f $path1_compressed
    end
end