-
Notifications
You must be signed in to change notification settings - Fork 7
/
crawl.py
52 lines (44 loc) · 1.86 KB
/
crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import time
import traceback

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
def getHtml(url):
    """Fetch *url* with an HTTP GET and return the decoded response body.

    The request is sent with a desktop Firefox ``user-agent`` header and a
    40-second timeout. The response text is decoded using the encoding that
    ``requests`` detects from the body (``apparent_encoding``) rather than
    the one declared in the HTTP headers.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as text, or ``None`` when the request fails
        (connection error, timeout, or a non-2xx status). On failure the
        traceback is printed and no exception is propagated, so callers
        must check for ``None``.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0'}
    try:
        response = requests.get(url, timeout=40, headers=headers)
        # Turn 4xx/5xx answers into an exception so they take the failure path.
        response.raise_for_status()
    except requests.RequestException:
        # Best effort: report the problem and let the caller skip this page.
        # Only request errors are caught, so Ctrl-C still stops the script.
        traceback.print_exc()
        return None
    response.encoding = response.apparent_encoding
    return response.text
# ---------------------------------------------------------------------------
# Script body: for every DOI listed in data.txt, search IEEE Xplore for it in
# a Selenium-driven Chrome, read the article number from the result page,
# locate the PDF URL on the "stamp" page and save the PDF in the current
# working directory.
# ---------------------------------------------------------------------------

# data.txt is read as one DOI per line.
with open('data.txt', 'r') as f:
    data = f.read().splitlines()

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0'}
# Raw string: the Windows path contains backslashes that must not be treated
# as escape sequences.
browser = webdriver.Chrome(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
urlBase = 'https://ieeexplore.ieee.org/search/searchresult.jsp?action=search&newsearch=true&searchField=Search_All&matchBoolean=true&queryText="DOI":'

# Running count of successfully downloaded files; used as the file-name prefix.
ind = 0

try:
    for i, doi in enumerate(data):
        doi = doi.strip()
        if not doi:
            # Blank line in data.txt - nothing to search for.
            continue

        browser.get(urlBase + doi)
        # Fixed pause to let the search page finish rendering its results.
        time.sleep(5)

        # find_elements returns an empty list when nothing matches, whereas
        # find_element would raise and abort the whole run.
        link_list = browser.find_elements(By.XPATH, "//*[@data-artnum]")
        if not link_list:
            print('Failed to download the {}-th paper'.format(i))
            continue
        arcNum = link_list[0].get_attribute('data-artnum')

        pdfUrl = 'http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=' + arcNum
        html = getHtml(pdfUrl)
        if html is None:
            # getHtml already printed the traceback for the failed request.
            print('Failed to download the {}-th paper with article number {}'.format(i, arcNum))
            continue

        soup = BeautifulSoup(html, 'html.parser')
        # The PDF address is taken from the last <iframe> in the page body.
        result = soup.body.find_all('iframe') if soup.body is not None else []
        print(arcNum, result)
        src = result[-1].get('src') if result else None
        if not src:
            print('Failed to download the {}-th paper with article number {}'.format(i, arcNum))
            continue

        # Drop the query string so the URL ends with the PDF file name.
        downloadUrl = src.split('?')[0]
        try:
            response = requests.get(downloadUrl, timeout=80, headers=headers)
            # Do not save an HTTP error page under a .pdf name.
            response.raise_for_status()
        except requests.RequestException:
            traceback.print_exc()
            print('Failed to download the {}-th paper with article number {}'.format(i, arcNum))
            continue

        # File name: running index + last 12 characters of the download URL.
        fname = str(ind) + '_' + downloadUrl[-12:]
        ind += 1
        # 'wb' (not append) so a re-run replaces stale files instead of
        # concatenating a second copy onto them.
        with open(fname, 'wb') as f:
            print('start download file ', fname)
            f.write(response.content)
finally:
    # Always shut down Chrome and the driver process, even after an error.
    browser.quit()