-
Notifications
You must be signed in to change notification settings - Fork 2
/
csranking_top_professors.py
163 lines (137 loc) · 4.93 KB
/
csranking_top_professors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import csv
import time
import argparse
from bs4 import BeautifulSoup
from prettytable import PrettyTable
from selenium import webdriver
from config import *
from utils import *
# Mapping from CSRankings field display names to the short codes used in
# CSRankings URLs (e.g. "Computer security" -> "sec"). The codes are the
# values users pass via --fields.
fields_dict = {
    "Artificial intelligence": "ai",
    "Computer vision": "vision",
    "Machine learning & data mining": "mlmining",
    "Natural language processing": "nlp",
    "The Web & information retrieval": "ir",
    "Computer architecture": "arch",
    "Computer networks": "comm",
    "Computer security": "sec",
    "Databases": "mod",
    "Design automation": "da",
    "Embedded & real-time systems": "bed",
    "High-performance computing": "hpc",
    "Mobile computing": "mobile",
    "Measurement & perf. analysis": "metrics",
    "Operating systems": "ops",
    "Programming languages": "plan",
    "Software engineering": "soft",
    "Algorithms & complexity": "act",
    "Cryptography": "crypt",
    "Logic & verification": "log",
    "Comp. bio & bioinformatics": "bio",
    "Computer graphics": "graph",
    "Computer science education": "csed",
    "Economics & computation": "ecom",
    "Human-computer interaction": "chi",
    "Robotics": "robotics",
    "Visualization": "visualization",
}
def print_field_choices():
    """Display every supported research field alongside its URL code."""
    choices = PrettyTable()
    choices.field_names = ["Field", "Code"]
    rows = [[field_name, field_code] for field_name, field_code in fields_dict.items()]
    for row in rows:
        choices.add_row(row)
    print(choices)
def parse_professors(tbody):
    """Collect professor records from a university's nested professor table.

    Iterates the direct child rows of `tbody`, parses each one, and keeps
    only the rows that produced a non-empty professor dict.
    """
    rows = tbody.find_all("tr", recursive=False)
    parsed = (parse_professor_info(row) for row in rows)
    return [info for info in parsed if info]
def parse_professor_info(prof_tr):
    """Parse one professor row into a dict.

    Returns None when the row has no <td> cells; otherwise a dict that may
    contain "name", "home_page", and "google_scholar" keys, depending on
    which links are present in the row.
    """
    cells = prof_tr.find_all("td")
    if not cells:
        return None
    professor = {}
    for idx, cell in enumerate(cells):
        # Only every fourth cell (offset 1) holds the author links.
        if idx % 4 != 1:
            continue
        homepage = cell.find("a", title="Click for author's home page.")
        if homepage:
            professor["name"] = clean_text(homepage.text)
            professor["home_page"] = homepage["href"]
        scholar = cell.find("a", title="Click for author's Google Scholar page.")
        if scholar:
            professor["google_scholar"] = scholar["href"]
    return professor
def fetch_universities(url):
    """Fetch the CSRankings page at `url` and parse universities + professors.

    Launches a Chrome WebDriver, waits WAIT_TIME seconds for the
    JavaScript-rendered ranking table to appear, then parses the page with
    BeautifulSoup.

    Returns:
        A list of dicts, each with "rank", "name", and "professors" keys.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Adjust WAIT_TIME (from config) to your network speed so the
        # JS-rendered table has time to load.
        time.sleep(WAIT_TIME)
        page_source = driver.page_source
    finally:
        # Bug fix: always release the browser, even if the page load raises.
        driver.quit()
    soup = BeautifulSoup(page_source, "html.parser")
    universities = []
    table = soup.find("table", id="ranking")
    tbody = table.find("tbody")
    trs = tbody.find_all("tr", recursive=False)
    # Rows come in groups of three: university row, spacer row, professors row.
    university_info = None  # guard against use before first rank row
    for i, tr in enumerate(trs):
        if i % 3 == 0:
            university_info = parse_university_info(tr)
        if i % 3 == 2 and university_info:
            university_info["professors"] = parse_professors(tr.find("tbody"))
            universities.append(university_info)
    return universities
def parse_university_info(tr):
    """Parse a ranking row into {"rank": ..., "name": ...}.

    Returns None when the row has no <td> cells.
    """
    cells = tr.find_all("td")
    if not cells:
        return None
    info = {}
    for idx, cell in enumerate(cells):
        position = idx % 4
        # Cells repeat in groups of four: rank, name, then auxiliary columns.
        if position == 0:
            info["rank"] = clean_number(cell.text)
        elif position == 1:
            info["name"] = clean_text(cell.text)
    return info
def parse_arguments(argv=None):
    """Parse CLI arguments and return (fields, start_year, end_year).

    Args:
        argv: optional list of argument strings (for testing); defaults to
            sys.argv[1:] via argparse.

    Returns:
        A tuple of ("&"-joined field codes, start_year, end_year) ready to
        be embedded in a CSRankings URL.

    Raises:
        SystemExit: via parser.error() on an invalid year range or an
            unrecognized field code.
    """
    parser = argparse.ArgumentParser(
        description="Fetch universities and professors data from CSRankings."
    )
    parser.add_argument(
        "--fields",
        type=str,
        required=True,
        help='Code of relevant fields, using "," to split multiple fields (e.g., "sec,ai" for Security and Artificial Intelligence)',
    )
    parser.add_argument(
        "--start_year", type=int, default=2020, help="Start year (default 2020)"
    )
    parser.add_argument(
        "--end_year",
        type=int,
        default=time.localtime().tm_year,
        # Bug fix: the help text claimed a hard-coded "2024" while the actual
        # default is the current year.
        help="End year (default: current year)",
    )
    args = parser.parse_args(argv)
    current_year = time.localtime().tm_year
    if args.start_year > args.end_year or args.end_year > current_year:
        parser.error("Invalid year range.")
    # Bug fix: strip whitespace BEFORE validating, so inputs like "sec, ai"
    # pass validation (the original stripped spaces only in the return value).
    field_codes = [code.strip() for code in args.fields.split(",")]
    valid_codes = set(fields_dict.values())
    if not all(code in valid_codes for code in field_codes):
        parser.error("Invalid field code.")
    return "&".join(field_codes), args.start_year, args.end_year
if __name__ == "__main__":
    # Show the available field codes, then build the CSRankings URL from the
    # user's choices. CSRankings encodes all filters in the URL fragment.
    print_field_choices()
    fields, from_year, to_year = parse_arguments()
    url = f"https://csrankings.org/#/fromyear/{from_year}/toyear/{to_year}/index?{fields}&world"
    print(f"Your URL: {url}")
    universities = fetch_universities(url)
    # Filename encodes the year range and the selected field codes.
    filename = f'{from_year}-{to_year}-{fields.replace(" ", "").replace("&", "-")}.csv'
    save_universities_to_csv(filename, universities)
    # Bug fix: print the actual output filename instead of the literal
    # placeholder text "(unknown)".
    print(f"Data has been saved to {filename}")