-
Notifications
You must be signed in to change notification settings - Fork 0
/
10.index
55 lines (48 loc) · 1.25 KB
/
10.index
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/bin/bash
#SBATCH --job-name=index
#SBATCH --partition="small"
#SBATCH --time=72:00:00
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=128
## Use 3750 MB per CPU instead when going for the 512 GB nodes
#SBATCH --mem-per-cpu=1750
#SBATCH --output=logs/%x.out
#
# Build the mhindex clusters file for one input directory.
#
# Input (environment):
#   HQ_ENTRY  "<input_dir> [band]", fields separated by single spaces.
#             NOTE(review): presumably provided by the job scheduler entry;
#             confirm against the submitting script.
#
# Behaviour:
#   * Runs `mhindex` inside monotextor.sif over every
#     <input_dir>/batch_*.jsonl.zst file and compresses its stdout with zstd.
#   * Without a band the result is <input_dir>/clusters.zst.
#   * With a band N the result is <input_dir>/clusters.N.zst and mhindex is
#     called with `-b N-1`.
#   * If the output file already exists and is non-empty, nothing is done.
#   * Output is written to a temporary file first and only renamed on success.
#
# Exit status: 0 on success or when the output already exists, non-zero on
# any error.
set -euo pipefail
shopt -s failglob

source .env

# Fail early with a readable message instead of an "unbound variable" error.
: "${HQ_ENTRY:?HQ_ENTRY must be set to \"<input_dir> [band]\"}"
IFS=" " read -ra args <<< "$HQ_ENTRY"
if [[ ${#args[@]} -eq 0 ]]; then
  echo "HQ_ENTRY is blank; expected \"<input_dir> [band]\"" >&2
  exit 1
fi
#L=${args[0]}
#COLL=${args[1]}
TMPSFX=tmp_$$
INPUT_DIR=${args[0]}

# Collect the input batches with a glob (never parse `ls`), keeping each path
# as a separate array element so unusual characters in paths survive.
# The explicit check below stops the script when nothing matched, without
# depending on how failglob and errexit interact for this assignment.
JSON_FILES=()
JSON_FILES=("$INPUT_DIR"/batch_*.jsonl.zst)
if [[ ! -e "${JSON_FILES[0]:-}" ]]; then
  echo "No batch_*.jsonl.zst files found in $INPUT_DIR" >&2
  exit 1
fi
echo "${JSON_FILES[*]}" >&2

# The mhindex command line is built as an array so that optional arguments
# are added as whole words rather than by string concatenation.
MHINDEX_CMD=(mhindex --batch-size 20000)

# If it's an array job, do distributed index
if [[ ${#args[@]} -eq 2 ]]; then
  band=${args[1]}
  # The band in HQ_ENTRY is passed to mhindex decremented by one
  # (presumably 1-based here and 0-based in mhindex; TODO confirm).
  MHINDEX_CMD+=(-b "$((band - 1))")
  OUTPUT_FILE=$INPUT_DIR/clusters.$band.zst
else
  OUTPUT_FILE=$INPUT_DIR/clusters.zst
fi

# A non-empty output means this entry was already processed.
if [[ -s "$OUTPUT_FILE" ]]; then
  echo "Clusters file already exists, exiting... $OUTPUT_FILE" >&2
  exit 0
fi

# Disabled: per-language tokenizer selection, kept for reference.
#case "$L" in
## zh | ja | th)
## MHINDEX_CMD+=(--tokenizer char);;
# *)
# MHINDEX_CMD+=(--tokenizer whitespace);;
#esac

MHINDEX_CMD+=("${JSON_FILES[@]}")

WORK_DIR=$(pwd -P)
TMP_OUTPUT=$OUTPUT_FILE.$TMPSFX
# Remove a partial temporary file on any exit path; after the final `mv`
# the file no longer exists and this is a no-op.
trap 'rm -f -- "$TMP_OUTPUT"' EXIT

# Build the index and save clusters array
singularity exec --bind "$WORK_DIR" --bind "$INPUT_DIR" --pwd "$WORK_DIR" monotextor.sif \
  "${MHINDEX_CMD[@]}" \
  | zstd -10 -T64 \
  > "$TMP_OUTPUT" \
  || {
    # Capture the per-stage statuses before any other command overwrites them.
    pipeline_status=("${PIPESTATUS[@]}")
    echo "Error in pipeline: ${pipeline_status[*]}" >&2
    exit 1
  }

# Publish the result only after the whole pipeline succeeded.
mv -- "$TMP_OUTPUT" "$OUTPUT_FILE"