-
Notifications
You must be signed in to change notification settings - Fork 0
/
save_company
116 lines (105 loc) · 2.77 KB
/
save_company
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# Probe PATH for the optional `miceweb` executable and record the outcome in
# ISMWINST (1 = found, 0 = not found). A missing tool is only reported on
# stderr; the script keeps running either way.
if type -P miceweb >/dev/null 2>&1; then
  ISMWINST=1
else
  ISMWINST=0
  echo "Error: miceweb is not installed (seeing https://github.com/Robotizing/MiceWeb/)" >&2
fi
# Download the numbered continuation pages <url>/page2, <url>/page3, ... into
# the current directory.
#
# Arguments:
#   $1 - base URL of the listing (no trailing slash); page N is requested as
#        "$1/pageN"
#
# The size check below reads pageN.html from the current directory, i.e. it
# relies on `wget -E` having stored the response under that name.
#
# The loop ends when any of these holds:
#   - wget exits non-zero;
#   - page 2000 has been fetched;
#   - the page number is at least 10, the page is exactly as large as the
#     previous one, and it is at most 8 bytes larger than the smallest page
#     seen so far (presumably the site keeps serving the same near-empty page
#     past the end of the listing - TODO confirm).
get_pages() {
  local base_url="$1"
  local prev_size=1000000000   # sentinel, larger than any real page
  local min_size=$prev_size
  local page size

  for page in {2..2000}; do
    sleep 0.1   # small pause between requests
    wget -E "$base_url/page$page" || break

    # Nothing to measure if the page was stored under another name; keep
    # downloading and leave the size statistics untouched.
    [[ -r "page$page.html" ]] || continue

    # The arithmetic expansion strips the padding some `wc` builds emit.
    size=$(( $(wc -c <"page$page.html") ))
    # echo "$size $prev_size $min_size $page"

    if (( page >= 10 && size == prev_size && size <= min_size + 8 )); then
      break
    fi

    prev_size=$size
    if (( size < min_size )); then
      min_size=$size
    fi
  done
}
# The company slug (the path component after /companies/ in the habr.com URL)
# is the script's only argument and is required.
COMPANY="${1:-}"
if [[ -z "$COMPANY" ]]; then
  # Usage errors belong on stderr so they do not mix with regular output.
  echo "The script needs a company name as a parameter." >&2
  exit 1 # TODO: adjust error code
fi
# Name of the directory that holds the downloaded pages.
SUBDIR="company_$COMPANY"
# Base URL of the company's section; every downloaded page lives under it.
ADDR="https://habr.com/ru/companies/$COMPANY"
# Feed the company's five top-level URLs (profile, articles, news, fans,
# workers) to `miceweb save urls -` on stdin, one URL per line.
#
# Globals:
#   ISMWINST (read) - 1 when miceweb is available; anything else makes this
#                     function a no-op
#   ADDR     (read) - base URL of the company section
# Returns:
#   the exit status of miceweb, or 0 when miceweb is not available
miceweb_save_urls() {
  if [[ "${ISMWINST:-0}" -eq 1 ]]; then
    # Each URL is passed as one quoted word so it reaches miceweb verbatim,
    # with no word splitting or globbing applied to $ADDR.
    miceweb save urls - < <(
      printf '%s\n' \
        "$ADDR/profile/" \
        "$ADDR/articles/" \
        "$ADDR/news/" \
        "$ADDR/fans/all/rating/" \
        "$ADDR/workers/all/rating/"
    )
  fi
}
# Create a sub-directory, download a listing's first page into it, then fetch
# the listing's continuation pages with get_pages.
#
# Arguments:
#   $1 - directory to create and download into
#   $2 - URL of the listing's first page; also handed to get_pages as the
#        base URL
# Returns:
#   non-zero, without downloading anything, when the directory cannot be
#   created or entered; otherwise the status of returning to the directory
#   the function was called from
create_directory_and_get_pages() {
  local directory="$1"
  local URL="$2"
  local start_dir="$PWD"

  sleep 2.5   # pause before starting the next listing

  # Never download when the target directory is unusable: the files would
  # land in the caller's directory and the final cd would then leave the
  # caller one level too high.
  mkdir -p -- "$directory" || return 1
  cd -- "$directory" || return 1

  wget -E "$URL"
  get_pages "$URL"

  # Go back to exactly where we started, whatever "$directory" looked like.
  cd -- "$start_dir"
}
# Decide what to do from what already exists in the current directory:
#   $SUBDIR      - finished download: re-save the URLs, then carry on with the
#                  rest of the script
#   $SUBDIR.zip  - finished and zipped: re-save the URLs and exit
#   ~$SUBDIR     - leftover of an unfinished run: refuse to continue
#   none of them - download into ~$SUBDIR, then rename it to $SUBDIR
# The "~" is always quoted, so it is a literal first character of the
# directory name, not home-directory expansion.
if [ -e "$SUBDIR" ]; then
  echo "$SUBDIR already exist (completely downloaded, but when?)"
  miceweb_save_urls
elif [ -e "$SUBDIR.zip" ]; then
  echo "$SUBDIR already exist (completely downloaded and zipped)"
  miceweb_save_urls
  exit
elif [ -e "~$SUBDIR" ]; then
  echo "~$SUBDIR already exist (incompletely downloaded), you should remove it"
  exit 1
else
  # Stop right away if the work directory cannot be created or entered;
  # otherwise every download below would land in the current directory and
  # the final `cd ..` would move one level too high.
  mkdir "~$SUBDIR" || exit 1
  cd "~$SUBDIR" || exit 1

  wget -E "$ADDR/profile/"
  create_directory_and_get_pages articles "$ADDR/articles"
  create_directory_and_get_pages news "$ADDR/news"
  create_directory_and_get_pages fans "$ADDR/fans/all/rating"
  create_directory_and_get_pages workers "$ADDR/workers/all/rating"

  if [[ "${ISMWINST:-0}" -eq 1 ]]; then
    #TODO: use --archives=all, save posts from next pages
    miceweb_save_urls
    # index.html is presumably the profile page stored by the wget call
    # above - TODO confirm.
    miceweb save urls index.html
    miceweb save urls articles/articles.html --base="https://habr.com" --grep="articles/"
    miceweb save urls news/news.html --base="https://habr.com" --grep="news/"
  fi

  # Dropping the "~" prefix marks the download as complete.
  cd .. || exit 1
  mv "~$SUBDIR" "$SUBDIR"
fi
# Archive the finished download and, once the archive has been written and
# verified, delete the downloaded directory. Nothing happens when $SUBDIR is
# not a directory or the archive already exists.
ZIPFILE="$SUBDIR.zip"
if [ -d "$SUBDIR" ] && [ ! -e "$ZIPFILE" ]; then
  # -1: fastest compression, -T: test the new archive, -r: recurse.
  if zip -1 -T -r "$ZIPFILE" "$SUBDIR"; then
    if [ ! -e "~$SUBDIR" ]; then
      echo -n "Deleting ~$SUBDIR..."
      # Rename to the "~" name before deleting; presumably so that an
      # interrupted removal leaves a directory that a later run reports as
      # incomplete - TODO confirm.
      if mv "$SUBDIR" "~$SUBDIR" && rm -r "~$SUBDIR"; then
        echo " Done"
      else
        echo ""
        exit 1
      fi
    fi
  else
    # Report the failure through the exit status instead of ending with 0.
    echo "Error: failed to create $ZIPFILE" >&2
    exit 1
  fi
fi