lister.py
"""Scrape the BBC Sound Effects archive (bbcsfx.acropolis.org.uk) page by page,
collecting (download URL, label) pairs and pickling them to list.pkl."""
import pickle
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

browser = webdriver.Firefox()
url = 'http://bbcsfx.acropolis.org.uk/?page='
page_num = 1    # first page to scrape
page_lim = 641  # last page: pages 1-640 list 25 files each, page 641 only 11
results = []    # accumulated (href, label) tuples
for page in range(page_num, page_lim + 1):
    # Fetch each results page in turn.
    print(f'Getting page {page}')
    browser.get(url + str(page))
    # Crude fixed wait for the results table to render.
    time.sleep(8)
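    # A more robust alternative to the fixed sleep would be an explicit wait,
    # sketched here as a comment (not part of the original script; the row
    # XPath is an assumption about the page's table markup):
    #   from selenium.webdriver.support.ui import WebDriverWait
    #   from selenium.webdriver.support import expected_conditions as EC
    #   WebDriverWait(browser, 30).until(
    #       EC.presence_of_element_located((By.XPATH, '//table/tbody/tr')))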
    # Every page lists 25 files except the last, which lists only 11.
    rows = 25 if page != page_lim else 11
    for i in range(1, rows + 1):
        try:
            # Column 5 of each table row holds the download link, column 1 the label.
            href = browser.find_element(
                By.XPATH, f'/html/body/section/div/div/div[3]/table/tbody/tr[{i}]/td[5]/a')
            href = href.get_property('href')
            label = browser.find_element(
                By.XPATH, f'/html/body/section/div/div/div[3]/table/tbody/tr[{i}]/td[1]')
            label = label.text
            print(f'appending ({href}, {label})')
            results.append((href, label))
        except NoSuchElementException:
            print('skipping element for exception')
    # Checkpoint the full list after every page, so a crash loses at most one page.
    with open('list.pkl', 'wb') as f:
        pickle.dump(results, f)
    print('pickling list')

browser.quit()
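
# The original file imported urlretrieve (and requests) without using them,
# which suggests a companion download step. Below is a minimal sketch of such
# a downloader, e.g. as a separate script run after lister.py finishes; the
# 'sounds/' output directory and the filename scheme (last URL path segment)
# are assumptions, not part of the original script.
import os
from urllib.request import urlretrieve

with open('list.pkl', 'rb') as f:
    pairs = pickle.load(f)  # the (href, label) tuples written above

os.makedirs('sounds', exist_ok=True)
for href, label in pairs:
    # Derive a local filename from the URL's last path segment (assumption).
    dest = os.path.join('sounds', href.rsplit('/', 1)[-1])
    if not os.path.exists(dest):  # skip files fetched on a previous run
        print(f'downloading {label} -> {dest}')
        urlretrieve(href, dest)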