#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Quick script to scrape and save all NeurIPS papers
Author: Rohit Swami
Email: [email protected]
GitHub: https://www.github.com/rowhitswami
"""
# Importing necessary libraries
import argparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
from tqdm import tqdm
# Initializing argparse
parser = argparse.ArgumentParser(description='Script to scrape NeurIPS papers')
parser.add_argument('-start', action="store", default=1987, dest="start_year", type=int, help='The first year to scrape papers from')
parser.add_argument('-end', action="store", default=2019, dest="end_year", type=int, help='The last year to scrape papers from')
arguments = parser.parse_args()
# Validating arguments
if arguments.start_year < 1987 or arguments.start_year > 2019:
    raise ValueError("Please enter a valid start year. Possible values are [1987, 2019].")
if arguments.end_year < 1987 or arguments.end_year > 2019:
    raise ValueError("Please enter a valid end year. Possible values are [1987, 2019].")
if arguments.start_year > arguments.end_year:
    raise ValueError("Start year shouldn't be greater than end year. Possible values are [1987, 2019].")

# Constants
BASE_URL = "https://papers.nips.cc/paper/"
PARSER = 'lxml'
HEADERS = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36",
}

start_year = arguments.start_year
end_year = arguments.end_year
papers = []
paper_authors = []


def get_conference_url(start_year, end_year):
    """Return the URLs of all conferences between start_year and end_year (inclusive)."""
    conferences = []
    print("Preparing data...")
    for year in tqdm(range(start_year, end_year + 1)):
        year_url = BASE_URL + str(year)
        conferences.append({"URL": year_url})
    return conferences
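# For illustration, get_conference_url(2018, 2019) returns:
#   [{'URL': 'https://papers.nips.cc/paper/2018'},
#    {'URL': 'https://papers.nips.cc/paper/2019'}]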


def get_all_hashes(url):
    """
    Context: the NeurIPS website follows a structured pattern, maintaining a hash for each paper.
    Return all the hashes for a particular year.
    """
    try:
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, PARSER)
            hashes = []
            for li in soup.find("div", class_="container-fluid").find_all("li"):
                paper_url = li.a.get('href')
                # The hash is the last path segment, up to the first hyphen
                paper_hash = paper_url.split("/")[-1].split("-")[0]
                hashes.append(paper_hash)
            return hashes
        else:
            print("Couldn't complete the request.")
            return False
    except ConnectionError as error:
        print(error)
        return None
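# Sketch of the extraction above, with a hypothetical href value:
#   "/paper/2019/hash/1234abcd-Abstract.html".split("/")[-1]  -> "1234abcd-Abstract.html"
#   "1234abcd-Abstract.html".split("-")[0]                    -> "1234abcd"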


def scrap_paper_and_authors(year_url, hashes):
    """Scrape papers and authors using the extracted hashes."""
    for paper_hash in tqdm(hashes):
        paper_url = year_url + "/file/" + paper_hash + "-Metadata.json"
        try:
            response = requests.get(paper_url, headers=HEADERS)
            if response.status_code == 200:
                doc = response.json()

                # Extracting the paper
                paper = {}
                paper['source_id'] = doc['sourceid']
                paper['year'] = year_url.split("/")[-1]
                paper['title'] = doc['title']
                paper['abstract'] = doc['abstract'] if doc['abstract'] else None
                paper['full_text'] = doc['full_text']
                papers.append(paper)

                # Extracting the authors of the paper
                for author in doc['authors']:
                    author_details = {}
                    author_details['source_id'] = doc['sourceid']
                    author_details['first_name'] = author['given_name'] if author['given_name'] else None
                    author_details['last_name'] = author['family_name'] if author['family_name'] else None
                    author_details['institution'] = author['institution'] if author['institution'] else None
                    paper_authors.append(author_details)
            else:
                print("Couldn't complete the request.")
                break
        except ConnectionError as error:
            print(error)
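# The -Metadata.json payload is assumed to carry at least the keys read above;
# a minimal hypothetical example (values invented for illustration):
#   {
#       "sourceid": 9999,
#       "title": "A Hypothetical Paper",
#       "abstract": "...",
#       "full_text": "...",
#       "authors": [{"given_name": "Ada", "family_name": "Lovelace",
#                    "institution": "Example University"}]
#   }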


def save_file(file_name, data):
    """Convert a list of dicts to a pandas DataFrame and save it as a CSV file."""
    df = pd.DataFrame(data)
    df.to_csv(file_name, index=False, header=True)
    print(f"Successfully saved {file_name}")


# Getting all conferences
conferences = get_conference_url(start_year, end_year)

# Scraping papers and authors
for year in conferences:
    hashes = get_all_hashes(year["URL"])
    if hashes:
        scrap_paper_and_authors(year["URL"], hashes)

# Saving the data as CSV files
if papers and paper_authors:
    save_file("papers.csv", papers)
    save_file("paper_authors.csv", paper_authors)
else:
    print("Couldn't save the files!")