-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse_UN_html.py
158 lines (123 loc) · 4.13 KB
/
parse_UN_html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import re
from bs4 import BeautifulSoup
import requests
from urlREGEX import URL_REGEX
import os
import shutil
import argparse
# Optional HTTP/HTTPS/FTP proxies for the requests.get() calls below;
# the commented examples are kept for quick re-enabling on proxied networks.
# http_proxy = "http://10.3.100.207:8080"
# https_proxy = "https://10.3.100.207:8080"
# ftp_proxy = "ftp://10.3.100.207:8080"
# Empty strings mean "no proxy".
http_proxy = ""
https_proxy = ""
ftp_proxy = ""
# Scheme -> proxy URL mapping passed as the `proxies=` argument of requests.get().
proxyDict = {
    "http": http_proxy,
    "https": https_proxy,
    "ftp": ftp_proxy
}
def getUniqueItems(iterable):
    """Return a list of the unique items in *iterable*, preserving first-seen order.

    Items must be hashable (here they are href strings). Uses dict key
    uniqueness — insertion-ordered in modern Python — for a single O(n)
    pass instead of the original quadratic list-membership scan.
    """
    return list(dict.fromkeys(iterable))
# CLI: -u saved course HTML file, -d destination folder prefix, -s start lesson.
parser = argparse.ArgumentParser(description='Adding course_url and dest_folder')
parser.add_argument('-u', '--url', help='URL')
parser.add_argument('-d', '--dest', help='destination folder name', required=True)
parser.add_argument('-s', '--start', help='start_lesson', default=0)
args = vars(parser.parse_args())

# The course page is parsed from a local file (despite the flag's name, -u is
# a path); base_url stays empty so hrefs are used as-is later.
base_url = ''
with open(args['url']) as course_file:
    soup = BeautifulSoup(course_file, "lxml")

# Every hyperlink in the page; filtered down to lesson links further below.
lessons = [anchor['href'] for anchor in soup.find_all('a', href=True)]
def set_start_url(img_start_url):
    """Return *img_start_url* with its numeric filename reset to slide 0.

    e.g. 'http://host/path/7.jpeg' -> 'http://host/path/0.jpeg'.
    """
    pieces = img_start_url.rsplit('/', 1)
    name_parts = pieces[-1].split('.')
    name_parts[0] = '0'
    pieces[-1] = '.'.join(name_parts)
    return '/'.join(pieces)
def alternate(img_url):
    """Toggle the URL's extension slot between 'png' and 'jpeg'.

    URLs whose second dot-separated filename segment is neither 'png' nor
    'jpeg' are returned unchanged.
    """
    swap = {'png': 'jpeg', 'jpeg': 'png'}
    pieces = img_url.rsplit('/', 1)
    name_parts = pieces[-1].split('.')
    name_parts[1] = swap.get(name_parts[1], name_parts[1])
    pieces[-1] = '.'.join(name_parts)
    return '/'.join(pieces)
def get_next_url(img_start_url):
    """Return the URL of the next slide: '.../<n>.ext' -> '.../<n+1>.ext'."""
    pieces = img_start_url.rsplit('/', 1)
    name_parts = pieces[-1].split('.')
    # The slide index sits just before the extension.
    name_parts[0] = str(int(name_parts[-2]) + 1)
    pieces[-1] = '.'.join(name_parts)
    return '/'.join(pieces)
def download(url_as_str, dir='./images'):
    """Download *url_as_str* into *dir* with wget, skipping files already present.

    Returns the raw os.system() wait status: 0 on success (or when the file
    is already in *dir*); on Unix wget's exit code sits in the high byte,
    so a failed fetch (wget exit 8) surfaces as 2048 — the sentinel the
    caller download_all() checks for.
    """
    # Fix for the old TODO: create the target directory up front.
    os.makedirs(dir, exist_ok=True)
    filename = url_as_str.split('/')[-1]
    # BUG FIX: the original tested os.path.exists(filename) in the CWD,
    # so the "already downloaded" check never matched files inside *dir*.
    if os.path.exists(os.path.join(dir, filename)):
        return 0
    # NOTE(review): the URL is interpolated into a shell command; safe only
    # while URLs come from trusted scraped pages. os.system is kept (not
    # subprocess) because callers rely on the 2048 wait-status encoding.
    cmd = "wget -P " + dir + " -A jpg,jpeg,gif,png " + url_as_str
    return os.system(cmd)
def download_all(img_start_url, change=0):
    """Download consecutive slide images starting at *img_start_url*.

    Walks 0.ext, 1.ext, ... until download() reports 2048 (wget exit 8,
    i.e. fetch failed) for the current extension; the alternate png/jpeg
    extension is then tried for the same index (change=1) before
    concluding the lesson has no more slides.

    Returns True once both extensions fail at the same index.
    NOTE(review): recursion depth grows with slide count; a lesson with
    ~1000 slides would hit Python's default recursion limit.
    """
    response = download(img_start_url)
    if change == 0:
        if response == 2048:
            # BUG FIX: the original assigned this recursive call to
            # `response` and fell off the end, returning None instead of
            # propagating the final True back to the caller.
            return download_all(alternate(img_start_url), 1)
        else:
            return download_all(get_next_url(img_start_url), 0)
    elif change == 1:
        if response == 2048:
            # Both extensions failed at this index: end of the lesson.
            return True
        else:
            return download_all(get_next_url(img_start_url), 0)
# Keep only lesson links, dropping the duplicate "?source=..." variants
# (e.g. ?source=Course), then de-duplicate preserving order and prepend
# the base URL.
lesson_list = getUniqueItems(
    [link for link in lessons if '/lesson/' in link and "?source" not in link])
lesson_urls = [base_url + path for path in lesson_list]
print("\n",lesson_urls)
def get_img_url(lesson_url):
    """Fetch a lesson page and return the image URL from its itemprop="image" tag.

    Raises IndexError if the page has no such tag or the tag contains no
    URL matching URL_REGEX (str(None) == "None" yields no match).
    """
    # Removed the dead `imgs = []` local from the original.
    page = requests.get(lesson_url, proxies=proxyDict)
    lesson_soup = BeautifulSoup(page.content, "lxml")
    image_tag = lesson_soup.find(itemprop='image')
    return re.findall(URL_REGEX, str(image_tag))[0]
def mv_lessonwise(destname):
    """Move all .jpeg/.png files from ./images into ./images/<destname>.

    Creates the destination folder if it is missing. Returns the list of
    destination paths reported by shutil.move (the original discarded it).
    """
    src_dir = os.path.join(os.getcwd(), "images")
    target = os.path.join(src_dir, destname)
    print("\n\n MOVING FILES into " + target)
    if not os.path.exists(target):
        os.makedirs(target)
    # endswith accepts a tuple of suffixes, replacing the original
    # double-loop comprehension; paths built with os.path.join instead
    # of string concatenation.
    images = [os.path.join(src_dir, f) for f in os.listdir(src_dir)
              if f.endswith(('.jpeg', '.png'))]
    return [shutil.move(img, target) for img in images]
# Folder-name counter: lesson i is collected into "<dest><i>" under ./images.
i = int(args['start'])
print("i=",i,len(lesson_urls[int(args['start'])-1:]))
# NOTE(review): with the default --start of 0 the slice below becomes
# lesson_urls[-1:], i.e. only the LAST lesson is processed; passing
# --start 1 covers the whole course. Confirm the default is intended.
for lesson_url in lesson_urls[int(args['start'])-1:]:
    # Reset the lesson's image URL to slide 0, download slides until both
    # extensions fail, then sweep the images into this lesson's folder.
    img_start_url = set_start_url(get_img_url(lesson_url))
    confirm = download_all(img_start_url,0)
    mv_lessonwise(args['dest'] + str(i))
    i = i + 1
# lesson_url = lesson_urls[2]
# print(img_start_url)
# NOTE(review): `confirm` is unbound (NameError) if the loop body never ran.
print("\n\n\n" + str(confirm))