-
Notifications
You must be signed in to change notification settings - Fork 4
/
springerminer.py
132 lines (110 loc) · 5.22 KB
/
springerminer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 7 07:30:39 2020
@author: Jie Chen
"""
# Springer Paper Miner
import urllib.request
import os
import re
import requests
from bs4 import BeautifulSoup
import json
# Output folders, created at import time so the functions below can write
# into them without further checks.
# folderpath_citations receives the downloaded .ris reference lists;
# folderpath_paper receives the per-paper .json metadata files.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it cannot fail
# when the folder appears between the check and the creation.
folderpath_citations = './springer-citations/'
os.makedirs(folderpath_citations, exist_ok=True)
folderpath_paper = './iaied/'
os.makedirs(folderpath_paper, exist_ok=True)
def get_ris_springer(url):
    """Download the reference list of one paper as an RIS file.

    The file is fetched from ``https://link.springer.com<url>-references.ris``
    and stored in ``folderpath_citations`` under the last path segment of
    that address. Download errors are reported on standard output instead of
    being raised, so one missing reference list does not abort a batch run.

    Args:
        url: Site-relative path of the paper; it is appended to
            ``https://link.springer.com``.

    Returns:
        None.
    """
    ris_url = 'https://link.springer.com' + url + '-references.ris'
    # Everything after the final slash is the file name. The address always
    # contains a slash because of the "https://" prefix.
    ris_filename = ris_url.rsplit('/', 1)[-1]
    try:
        urllib.request.urlretrieve(ris_url, folderpath_citations + ris_filename)
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C and SystemExit still stop
        # the program; the reason is printed to make failures diagnosable.
        print('[Error] Unable to retrieve ' + ris_url + ' (' + str(err) + ')')
    else:
        print('[Success] ' + ris_filename + ' saved')
def get_json_springer(paper_url, timeout=30):
    """Scrape one Springer article page and return its metadata as JSON.

    Args:
        paper_url: Absolute URL of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Returns:
        A ``(jsondata, status)`` tuple. ``jsondata`` is a JSON string with
        the keys ``article-title``, ``author``, ``journal-title``,
        ``journal-volume``, ``pageStart``, ``pageEnd``, ``publication-year``,
        ``keywords``, ``citation`` and ``abstract``. ``status`` is ``True``
        when the page lists as many affiliations and addresses as author
        names; otherwise it is ``False`` and each author entry holds the
        name only.

    Raises:
        requests.RequestException: The page could not be fetched (network
            failure, timeout, or an HTTP error status).
        AttributeError: A required element (title, journal title, volume,
            page range, publication year or citation) is missing.
    """
    page = requests.get(paper_url, timeout=timeout)
    # Fail here on 4xx/5xx instead of parsing an error page and failing
    # later with a less helpful message.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    article_title = soup.find(attrs={"data-test": "article-title"}).text
    author_name = [
        name_html.text
        for name_html in soup.find_all(attrs={"data-test": "author-name"})
    ]
    author_affiliation = [
        aff_html.meta['content']
        for aff_html in soup.find_all(attrs={"itemprop": "affiliation"})
    ]
    author_address = [
        addr_html['content']
        for addr_html in soup.find_all(attrs={"itemprop": "address"})
    ]

    # The three lists are matched purely by position, which is only
    # meaningful when they have the same length.
    if len(author_name) == len(author_affiliation) == len(author_address):
        status = True
        author = [
            {'author-name': name, 'affiliation': affiliation, 'address': address}
            for name, affiliation, address
            in zip(author_name, author_affiliation, author_address)
        ]
    else:
        status = False
        print("[Warning] " + paper_url + " The numbers of authors and affliations do not match. Will store author name only. ")
        author = [{'author-name': name} for name in author_name]

    journal_title = soup.find(attrs={"data-test": "journal-title"}).text
    # Remove every whitespace character from the volume label.
    journal_volume = "".join(soup.find(attrs={"data-test": "journal-volume"}).text.split())
    pageStart = soup.find(attrs={"itemprop": "pageStart"}).text
    pageEnd = soup.find(attrs={"itemprop": "pageEnd"}).text
    publication_year = soup.find(attrs={"data-test": "article-publication-year"}).text
    keywords = [word_html.text for word_html in soup.find_all(attrs={"itemprop": "about"})]
    citation = soup.find(attrs={"class": "c-bibliographic-information__citation"}).text

    # The abstract is optional: no "#Abs1-content" element gives IndexError,
    # one without a <p> child gives AttributeError. Both mean "no abstract".
    try:
        abstract = soup.select('#Abs1-content')[0].p.text
    except (IndexError, AttributeError):
        abstract = ""

    data = {
        'article-title': article_title,
        'author': author,
        'journal-title': journal_title,
        'journal-volume': journal_volume,
        'pageStart': pageStart,
        'pageEnd': pageEnd,
        'publication-year': publication_year,
        'keywords': keywords,
        'citation': citation,
        'abstract': abstract,
    }
    return json.dumps(data), status
def save_json_springer(url):
    """Fetch one paper's metadata and store it as a JSON file.

    The file is written to ``folderpath_paper`` and named after the last
    path segment of ``url``. When ``get_json_springer`` reports that authors
    and affiliations could not be matched, the name gets a ``-doublecheck``
    suffix. An existing ``<name>.json`` is left untouched and the paper is
    not fetched again.

    Args:
        url: Site-relative path of the paper; it is appended to
            ``https://link.springer.com``.

    Returns:
        None. The outcome is reported on standard output.
    """
    # Last path segment; unlike a regex lookup this cannot raise IndexError
    # when the path contains no slash.
    data_filename = url.rsplit('/', 1)[-1]
    filepath = folderpath_paper + data_filename + '.json'
    if os.path.exists(filepath):
        print(filepath + ' already exists. Will not override.')
        return

    paper_url = 'https://link.springer.com' + url

    # Fetch before touching the disk so a failed download never leaves an
    # empty file behind.
    try:
        jsondata, status = get_json_springer(paper_url)
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C still stops the program.
        print('[Error] ' + paper_url + ' not saved. Reason: ' + repr(err))
        return

    if not status:
        filepath = folderpath_paper + data_filename + '-doublecheck.json'

    try:
        # The context manager closes (and flushes) the file on every path.
        with open(filepath, 'w') as json_file:
            json_file.write(jsondata)
    except OSError as err:
        print('[Error] ' + paper_url + ' not saved. Reason: ' + repr(err))
        # Do not keep a partially written file.
        if os.path.exists(filepath):
            os.remove(filepath)
        return

    print('[Success] ' + paper_url + ' saved as ' + filepath)
def main():
    """Interactively mine every paper of one Springer journal issue.

    Prompts for journal ID, volume and issue, loads the matching issue page
    and, for each paper link found in its table of contents, stores the
    paper metadata (``save_json_springer``) and downloads its reference list
    (``get_ris_springer``).

    Returns:
        None. Progress is reported on standard output.
    """
    print(
        'Springer Paper Miner. \n'
        'It is a program for mining Springer papers. This program will extract the references and paper information from all papers in a same journal, volume, and issue at one time. \n'
        '\n'
        'The format of Springer link is \'https://link.springer.com/journal/[journal ID]/[volume]/[issue] \n'
        'e.g. International Journal of Artificial Intelligence in Education 2020, Volume 30, Issue 1. \n'
        'Link: https://link.springer.com/journal/40593/30/1\n'
        'Journal ID: 40593 \n'
        'Volume: 30 \n'
        'Issue: 1 \n'
        '\n'
        'Please provide the journal ID, volume, and issue number. '
    )
    # Strip stray whitespace so a trailing space does not end up in the URL.
    journal_id = input('Journal ID: ').strip()
    volume = input('Volume: ').strip()
    issue = input('Issue: ').strip()
    journal_url = 'https://link.springer.com/journal/' + journal_id + '/' + volume + '/' + issue
    print('Mining papers and citations. The papers are on ' + journal_url)

    try:
        # The timeout prevents an unresponsive server from hanging the run.
        journal_page = requests.get(journal_url, timeout=30)
        journal_page.raise_for_status()
    except requests.RequestException as err:
        print('[Error] Unable to load ' + journal_url + ' (' + str(err) + ')')
        return

    soup = BeautifulSoup(journal_page.text, 'html.parser')
    links = soup.select('#kb-nav--main div.toc ol li div h3 a')
    if not links:
        print('[Warning] No paper links found on ' + journal_url)
    for a in links:
        href = a.get('href')
        if not href:
            # An anchor without a target cannot be mined.
            continue
        save_json_springer(href)
        get_ris_springer(href)
    print('End of the program. \nAll successfully obtained ris files have stored in \"springer-citations\" folder. ')