forked from tensorflow/tensor2tensor
-
Notifications
You must be signed in to change notification settings - Fork 1
/
smart_datagen.sh
executable file
·100 lines (90 loc) · 3.88 KB
/
smart_datagen.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/bin/bash
#
# Run t2t-datagen for the algorithmic_math_deepmind_all problem against the
# Cloud Storage bucket that matches the zone of the VM this shell is logged
# into (the VM is identified via $SSH_CONNECTION and `gcloud`).
#
# Usage:
#   ./smart_datagen.sh <DATA_DIR> <task> <SPECIFIC_SPLIT> [--dry-run]
#   e.g. ./smart_datagen.sh t2t-data Q12 True
#
#   <DATA_DIR>        path inside the bucket where the dataset is written
#   <task>            value passed as --task_direction
#   <SPECIFIC_SPLIT>  value passed as --specific_split
#   --dry-run         only print the command that would be run
#
# Exit status: 0 on success, 2 on invalid arguments, 1 when the VM / zone /
# bucket cannot be determined, otherwise the status of t2t-datagen.

#######################################
# Print the invocation signature to stderr.
#######################################
usage() {
  local self
  self=$(basename "$0")
  # The script name is passed as data (%s), never as part of the format.
  printf 'Invalid arguments provided. Signature is:\n' >&2
  printf './%s <DATA_DIR> <task> <SPECIFIC_SPLIT> (--dry-run)\n' "$self" >&2
  printf 'E.g. ./%s t2t-data Q12 True\n' "$self" >&2
}

#######################################
# Map a compute zone to its storage bucket.
# Arguments: $1 - zone name
# Outputs:   bucket URL on stdout
# Returns:   0 for a known zone, 1 otherwise (nothing is printed)
#######################################
bucket_for_zone() {
  case "$1" in
    us-central1-a | us-central1-b | us-central1-c | us-central1-f)
      printf '%s\n' 'gs://us_bucketbucket'
      ;;
    europe-west4-a)
      printf '%s\n' 'gs://mathsreasoning'
      ;;
    *)
      return 1
      ;;
  esac
}

#######################################
# Discover the VM, its zone, the matching TPU and the storage bucket.
# Globals:   SSH_CONNECTION (read); VM_IP, VM_INFO, VM_NAME, VM_ZONE,
#            TPU_INFO, TPU_IP, TPU_NAME, STORAGE_BUCKET (written, exported)
# Returns:   0 on success, 1 if the VM or its bucket cannot be determined
#######################################
detect_environment() {
  if [[ -z "${SSH_CONNECTION:-}" ]]; then
    printf 'SSH_CONNECTION is not set; cannot determine the VM address.\n' >&2
    return 1
  fi

  # Capture the dotted address that precedes the trailing port number in
  # SSH_CONNECTION, i.e. the address of the machine we are logged into.
  VM_IP=$(printf '%s\n' "$SSH_CONNECTION" \
    | sed 's/^.* \([0-9|\.]*\) [0-9]*$/\1/')
  if [[ -z "$VM_IP" ]]; then
    printf 'Could not extract the VM address from SSH_CONNECTION.\n' >&2
    return 1
  fi

  # -F: the dots in the address are literal; -w: 10.0.0.5 must not match
  # 10.0.0.50. Only the first matching row is used.
  VM_INFO=$(gcloud compute instances list \
    | grep -wF -- "$VM_IP" \
    | head -n 1)
  if [[ -z "$VM_INFO" ]]; then
    printf 'No compute instance found for address %s.\n' "$VM_IP" >&2
    return 1
  fi
  # The first two whitespace-separated columns are taken as name and zone.
  read -r VM_NAME VM_ZONE _ <<<"$VM_INFO"

  # TPU details are exported for convenience only; t2t-datagen is not given
  # them, so a missing TPU is deliberately not treated as an error.
  TPU_INFO=$(gcloud compute tpus list --zone="$VM_ZONE" \
    | grep -F -- "$VM_NAME" \
    | head -n 1)
  TPU_IP=$(printf '%s\n' "$TPU_INFO" \
    | sed 's/^.*v[0-9].*\s\([0-9]*\.[0-9]*\.[0-9]*\.[0-9]*\):[0-9]*\s.*$/\1/')
  read -r TPU_NAME _ <<<"$TPU_INFO"

  if ! STORAGE_BUCKET=$(bucket_for_zone "$VM_ZONE"); then
    # Refuse to continue: without a bucket DATA_DIR would become "/<arg>".
    printf '\n\n\nZONE variable is weird... ZONE = %s\n\n\n\n' "$VM_ZONE" >&2
    return 1
  fi

  export VM_IP VM_INFO VM_NAME VM_ZONE TPU_INFO TPU_IP TPU_NAME STORAGE_BUCKET
}

#######################################
# Print the t2t-datagen command line, mirroring the flags actually executed.
# Globals:   PROBLEM, DATA_DIR, TMP_DIR, SPECIFIC_SPLIT, TASK_DIRECTION (read)
# Arguments: $1 - heading line shown above the command
#######################################
print_command() {
  printf '\n%s\n' "$1"
  printf 't2t-datagen \\\n'
  printf ' --problem=%s \\\n' "$PROBLEM"
  printf ' --data_dir=%s \\\n' "$DATA_DIR"
  printf ' --tmp_dir=%s (/mathematics_dataset-v1.0) \\\n' "$TMP_DIR"
  printf ' --specific_split=%s \\\n' "$SPECIFIC_SPLIT"
  printf ' --task_direction=%s\n\n' "$TASK_DIRECTION"
}

#######################################
# Entry point: validate arguments, detect the environment, then either
# preview (--dry-run) or execute t2t-datagen.
# Arguments: <DATA_DIR> <task> <SPECIFIC_SPLIT> [--dry-run]
# Returns:   see "Exit status" in the file header
#######################################
main() {
  local dry_run=0

  if (( $# == 4 )) && [[ "$4" == "--dry-run" ]]; then
    dry_run=1
  elif (( $# != 3 )) || [[ "$3" == "--dry-run" ]]; then
    # Three arguments ending in --dry-run means a positional one is missing.
    usage
    return 2
  fi

  detect_environment || return 1

  export PROBLEM=algorithmic_math_deepmind_all
  # Original author's note: datagen will recognise the "easy" part and select
  # only the train-easy data.
  export DATA_DIR="${STORAGE_BUCKET}/$1"  # where the dataset goes
  # The source data is expected under /mathematics_dataset-v1.0 in the bucket.
  export TMP_DIR="${STORAGE_BUCKET}"
  export SPECIFIC_SPLIT="$3"
  export TASK_DIRECTION="$2"

  if (( dry_run )); then
    print_command "Process will run the following:"
    return 0
  fi

  print_command "Running..."
  t2t-datagen \
    --problem="$PROBLEM" \
    --data_dir="$DATA_DIR" \
    --tmp_dir="$TMP_DIR" \
    --specific_split="$SPECIFIC_SPLIT" \
    --task_direction="$TASK_DIRECTION"
}

main "$@"