-
Notifications
You must be signed in to change notification settings - Fork 0
/
10.dedup.sh
executable file
·103 lines (91 loc) · 2.93 KB
/
10.dedup.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/bin/bash
# Orchestrates the index + dedup pipeline on Slurm via HyperQueue (hq):
# builds a per-collection/per-language task list, runs `bash 10.index`
# over one allocation queue, then `bash 10.dedup` over a second one.
# Requires .env to define COLLECTIONS, WORKSPACE, SBATCH_ACCOUNT,
# SLURM_LOGS_DIR, and .checks to define get_seeded_random and confirm.
source .env
source .checks
set -euo pipefail
# Optional language filter taken from the first argument (empty = all).
langs=""
# Worker idle timeout for the hq allocation queues; also reused below as
# a grace period before removing each queue.
export IDLE_TIMEOUT=30s
# NOTE(review): this tests for exactly 2 arguments but only $1 is read,
# and $2 is never used anywhere in this script — presumably this should
# be `-eq 1` (or `-ge 1`); confirm against how callers invoke the script.
if [[ $# -eq 2 ]]; then
langs=$1
fi
# Create the task list
# sort them by size to start with the biggest
entries=$(mktemp)
dedup_entries=$(mktemp)
temp=$(mktemp)
# BUG FIX: the original registered three separate `trap … EXIT` handlers;
# each `trap` REPLACES the previous one for that signal, so only $temp was
# ever cleaned up and $entries/$dedup_entries leaked. Register all three
# temp files in a single EXIT handler instead.
trap 'rm -f -- "$entries" "$dedup_entries" "$temp"' EXIT
for coll in $(printf '%s\n' "${!COLLECTIONS[@]}" | sort); do
  for lang_dir in "$WORKSPACE/batches/$coll"/*; do
    # Total size of the language's batch files, in KB (du default unit).
    size=$(du -c "$lang_dir"/batch_*.jsonl.zst | tail -1 | cut -f1)
    # Languages over 150 GB get a distributed index: emit 17 shard tasks
    # ("dir i") instead of a single whole-directory task.
    if (( size > 150000000 )); then
      for i in {1..17}; do
        echo "$lang_dir $i"
      done
    else
      echo "$lang_dir"
    fi
  done
done >"$temp"
# Deterministic shuffle so the task order is reproducible across runs.
shuf --random-source=<(get_seeded_random 42) <"$temp" >"$entries"
# create entries for dedup: one line per lang_dir. Shard duplicates are
# emitted adjacently above, so plain (adjacent-only) uniq is sufficient.
cut -d' ' -f1 "$temp" | uniq >"$dedup_entries"
# Read from stdin so wc prints only the count, not the temp-file name.
echo "$(wc -l <"$entries") tasks"
confirm
# Create an allocation queue that will allocate a full node for each worker;
# each worker will process one task.
# Arguments:
#   $1 - queue name
#   $2 - maximum number of concurrent workers
#   $3 - memory per CPU (MB), forwarded to Slurm's --mem-per-cpu
# Globals: IDLE_TIMEOUT, SBATCH_ACCOUNT, SLURM_LOGS_DIR (read)
newqueue() {
  local name=$1
  local workers=$2
  local mem=$3
  # All expansions quoted so values containing spaces/globs survive intact.
  hq alloc add slurm --name "$name" \
    --workers-per-alloc 1 --max-worker-count "$workers" --backlog 10 \
    --idle-timeout "$IDLE_TIMEOUT" --time-limit 72h \
    -- -p small -A "$SBATCH_ACCOUNT" \
    --cpus-per-task 128 --ntasks 1 --mem-per-cpu "$mem" \
    -o "$SLURM_LOGS_DIR/hq-worker-%x.out" -e "$SLURM_LOGS_DIR/hq-worker-%x.err"
}
# Print the id of the (first) allocation queue whose name equals $1.
# Outputs: queue id on stdout; empty if no queue matches.
queueid() {
  # Pass the name via `jq --arg` instead of interpolating it into the jq
  # program: the original double-quoted interpolation broke (or allowed
  # filter injection) when the name contained quotes or jq syntax.
  hq alloc list --output-mode json \
    | jq -cr --arg name "$1" '.[] | select(.name == $name) | .id' \
    | head -1
}
### INDEX
WORKERS=200
queue_name=index
# 1750 MB per CPU across 128 CPUs per task.
newqueue $queue_name $WORKERS 1750
# obtain the allocation queue id
qid=$(queueid $queue_name)
# On Ctrl-C, cancel all outstanding jobs and force-remove the queue.
# Double quotes expand $qid NOW, binding the handler to this queue's id.
trap "hq job cancel all; hq alloc remove --force $qid" INT
set +e # relax -e so a failed submit does not abort the script before the queue can be closed below
# One task per line of $entries: plain "dir" lines index a whole language
# directory; "dir i" lines run shard i of a distributed index (see the
# task-list generation above).
hq submit --each-line $entries \
--nodes 1 --progress \
--log=$SLURM_LOGS_DIR/hq-index.log \
--max-fails=10 --crash-limit=5 \
bash 10.index
set -e
# Wait until the queue workers are shut down:
# sleep a bit more than the idle timeout to avoid running the remove command while workers are still shutting down
sleep $IDLE_TIMEOUT
sleep 30s
# remove the allocation queue
hq alloc remove --force $qid
### DEDUP
# Second phase: mirrors INDEX but with more memory per CPU (3500 MB) and
# one task per language directory (no shard lines in $dedup_entries).
queue_name=dedup
newqueue "$queue_name" "$WORKERS" 3500
# obtain the allocation queue id
qid=$(queueid "$queue_name")
# Rebind the INT handler to the new queue id (this replaces the INDEX
# phase's handler; the original's preceding `trap - INT` was redundant).
trap "hq job cancel all; hq alloc remove --force $qid" INT
# Relax -e so a failed submit still reaches the queue teardown below.
set +e
hq submit --each-line "$dedup_entries" \
  --nodes 1 --progress \
  --log="$SLURM_LOGS_DIR/hq-dedup.log" \
  --max-fails=10 --crash-limit=5 \
  bash 10.dedup
set -e
# Wait until the queue workers are shut down: sleep a bit more than the
# idle timeout so the remove doesn't race workers that are still exiting.
sleep "$IDLE_TIMEOUT"
sleep 30s
# Remove the allocation queue. --force added for consistency with the
# INDEX phase; the original omitted it here, which fails if any
# allocation is still alive despite the grace sleeps above.
hq alloc remove --force "$qid"