From bb269556af0882348c015701127ce34891322a44 Mon Sep 17 00:00:00 2001
From: Simon Fallnich
Date: Mon, 4 Mar 2019 12:47:30 +0100
Subject: [PATCH 1/3] Added fast fetching

---
 fetch_raw_fast.sh | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 fetch_raw_fast.sh

diff --git a/fetch_raw_fast.sh b/fetch_raw_fast.sh
new file mode 100644
index 0000000..6fd6296
--- /dev/null
+++ b/fetch_raw_fast.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+mkdir raw_dump && cd raw_dump
+cat $(ls ../urldumps | grep ".deduped.txt") | xargs curl $1 -o $1-$(date +"%T").html

From da91fcbe212d4035b84e6dcf949e509660ab4465 Mon Sep 17 00:00:00 2001
From: Simon Fallnich
Date: Mon, 4 Mar 2019 16:30:20 +0100
Subject: [PATCH 2/3] Added fast fetching tool

---
 fetch_raw_fast.sh | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)
 mode change 100644 => 100755 fetch_raw_fast.sh

diff --git a/fetch_raw_fast.sh b/fetch_raw_fast.sh
old mode 100644
new mode 100755
index 6fd6296..0c48ed8
--- a/fetch_raw_fast.sh
+++ b/fetch_raw_fast.sh
@@ -1,3 +1,11 @@
 #!/bin/bash
-mkdir raw_dump && cd raw_dump
-cat $(ls ../urldumps | grep ".deduped.txt") | xargs curl $1 -o $1-$(date +"%T").html
+
+export MAX_THREADS=100
+
+main() {
+    mkdir -p raw_dumps && cd url_dumps
+    cat $(ls -d $PWD/* | grep ".deduped.txt") | sed -E "p;s/[\/:]/-/g" | paste -d ' ' - - | xargs -n 2 -P $MAX_THREADS -l bash -c 'curl -o ../raw_dumps/$1-$(date | tr " " "-" | tr ":" "_").html $0'
+}
+
+# Execute main function
+main

From 713708f2f9d3decc8816459d910ff17808db78b4 Mon Sep 17 00:00:00 2001
From: Simon Fallnich
Date: Tue, 5 Mar 2019 14:27:06 +0000
Subject: [PATCH 3/3] Added progress script

---
 progress.sh | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100755 progress.sh

diff --git a/progress.sh b/progress.sh
new file mode 100755
index 0000000..88fc4b0
--- /dev/null
+++ b/progress.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+echo "print('{:.2f} minutes'.format(int(" $(date +%s%N) - $(cat start-time.txt) ") / 1e9 / 60));" "print(int(" $(du -s raw_dumps | cut -f1) "), 'bytes\n', int(" $(ls -1 raw_dumps | wc -l) "), 'files')" | python3