diff --git a/fetch_raw_fast.sh b/fetch_raw_fast.sh
new file mode 100755
index 0000000..0c48ed8
--- /dev/null
+++ b/fetch_raw_fast.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+#
+# Download every URL listed in url_dumps/*.deduped.txt (one URL per line)
+# into raw_dumps/, running up to MAX_THREADS curl processes in parallel.
+# Must be started from the directory that contains url_dumps/.
+
+# Maximum number of concurrent downloads; override via the environment.
+export MAX_THREADS="${MAX_THREADS:-100}"
+
+main() {
+  local -a url_lists
+
+  mkdir -p raw_dumps || return
+  # Stop here if url_dumps is missing instead of running the pipeline in
+  # the wrong directory.
+  cd url_dumps || return
+
+  # Glob instead of parsing ls output so odd file names survive intact.
+  shopt -s nullglob
+  url_lists=(*.deduped.txt)
+  shopt -u nullglob
+  if ((${#url_lists[@]} == 0)); then
+    # With no operands, cat would sit waiting on stdin.
+    echo "fetch_raw_fast: no *.deduped.txt files in url_dumps" >&2
+    return 1
+  fi
+
+  # Hand each URL to xargs as one NUL-terminated argument so quotes,
+  # backslashes and spaces inside a URL are not reinterpreted.
+  cat -- "${url_lists[@]}" \
+    | tr '\n' '\0' \
+    | xargs -0 -n 1 -P "$MAX_THREADS" bash -c '
+        url=$1
+        [ -n "$url" ] || exit 0  # skip blank lines
+        # Output name: URL with "/" and ":" replaced by "-", plus a timestamp.
+        stamp=$(date | tr " " "-" | tr ":" "_")
+        curl -o "../raw_dumps/${url//[\/:]/-}-${stamp}.html" "$url"
+      ' _
+}
+
+# Execute main function
+main
diff --git a/progress.sh b/progress.sh
new file mode 100755
index 0000000..88fc4b0
--- /dev/null
+++ b/progress.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Report download progress: minutes elapsed since the nanosecond epoch
+# timestamp stored in start-time.txt, plus the size and number of entries
+# of raw_dumps/. Needs GNU date (%N) and GNU du (-b).
+
+start_ns=$(<start-time.txt) || exit
+if [[ ! "$start_ns" =~ ^[0-9]+$ ]]; then
+  echo "progress: start-time.txt does not hold an integer timestamp" >&2
+  exit 1
+fi
+
+# 10# forces base 10 so a leading zero is not read as octal.
+elapsed_ns=$(($(date +%s%N) - 10#$start_ns))
+# du -sb reports real bytes; plain du -s counts blocks, which made the
+# "bytes" label wrong.
+total_bytes=$(du -sb raw_dumps | cut -f1)
+
+# Count non-hidden entries with a glob rather than parsing ls output.
+shopt -s nullglob
+entries=(raw_dumps/*)
+
+awk -v ns="$elapsed_ns" 'BEGIN { printf "%.2f minutes\n", ns / 1e9 / 60 }'
+printf '%d bytes\n %d files\n' "${total_bytes:-0}" "${#entries[@]}"