-
Notifications
You must be signed in to change notification settings - Fork 4
/
cocochorales_downloader.sh
executable file
·47 lines (40 loc) · 2.34 KB
/
cocochorales_downloader.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/bin/bash
# Mostly based on the cocochorales download script. We only care about the wav files so we don't
# need to download the other files related to midi.
# The main dataset is too big for fsx (500+ GB) but in each sample we only need the wav stems,
# not the full mix. After doing this trimming we get around 5.5 GB per folder, so we'll download from 1-60 inclusive = 330 GB to be on the safe side.
# In any case, the audiolm_pytorch code auto-separates code into train/valid/test sets. In reality there are 1-96 for train, and then 1-12 for valid and test each, but we'll just use the first 60 for now.
mkdir /fsx/itsleonwu/audiolm-pytorch-datasets/cocochorales_main_dataset_v1 # unzipped
mkdir /fsx/itsleonwu/audiolm-pytorch-datasets/cocochorales_main_dataset_v1_zipped
cd /fsx/itsleonwu/audiolm-pytorch-datasets/cocochorales_main_dataset_v1_zipped
# download md5 if it doesn't exist already
if [ ! -f "cocochorales_md5s.txt" ]; then
wget https://storage.googleapis.com/magentadata/datasets/cocochorales/cocochorales_full_v1_zipped/cocochorales_md5s.txt
fi
# download main dataset, specifically train
for i in $(seq 1 1 60); do
# wget if $i.tar.bz2 doesn't exist in cocochorales_main_dataset_v1_zipped
if [ ! -f "$i".tar.bz2 ]; then
wget https://storage.googleapis.com/magentadata/datasets/cocochorales/cocochorales_full_v1_zipped/main_dataset/train/"$i".tar.bz2
fi
# copy to s3, zipped. only run this once!
# aws s3 cp /fsx/itsleonwu/audiolm-pytorch-datasets/cocochorales_main_dataset_v1_zipped/"$i".tar.bz2 s3://s-laion/itsleonwu-laion/cocochorales_main_dataset_v1_zipped --profile laion-stability-my-s3-bucket
cd ../cocochorales_main_dataset_v1
mkdir $i
tar -xjf ../cocochorales_main_dataset_v1_zipped/$i.tar.bz2 -C ./$i
# copy to s3, unzipped. only run this once!
aws s3 cp /fsx/itsleonwu/audiolm-pytorch-datasets/cocochorales_main_dataset_v1/"$i" s3://itsleonwu-laion/cocochorales_main_dataset_v1/$i --recursive --profile laion-stability-my-s3-bucket
# Cleanup: keep only stem wavs
cd $i
# Loop through subfolders
for subfolder in */; do
cd "$subfolder"
cp stems_audio/*.wav . # copy to subfolder
# Delete all other files
rm -rf stems_audio stems_midi metadata.yaml mix.mid mix.wav
cd .. # back to $i
done
cd .. # back to cocochorales_main_dataset_v1
echo "completed $i"
cd ../cocochorales_main_dataset_v1_zipped
done